aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
author Filipe Manana <fdmanana@suse.com> 2016-01-21 05:17:54 -0500
committer Chris Mason <clm@fb.com> 2016-01-25 19:50:26 -0500
commit de0ee0edb21fbab4c7afa3e94573ecfebfb0244e (patch)
tree 52b0a9db694a7810bed41f04fa5be2b843352484 /fs/btrfs/tree-log.c
parent 6b5aa88c861cf0e4156e490009e2018d4fc81109 (diff)
Btrfs: fix race between fsync and lockless direct IO writes
An fsync, using the fast path, can race with a concurrent lockless direct IO write and end up logging a file extent item that points to an extent that wasn't written to yet. This is because the fast fsync path collects ordered extents into a local list and then collects all the new extent maps to log file extent items based on them, while the direct IO write path creates the new extent map before it creates the corresponding ordered extent (and submits the respective bio(s)).

So fix this by making the direct IO write path create ordered extents before the extent maps and make the fast fsync path collect any new ordered extents after it collects the extent maps.

Note that making the fsync handler call inode_dio_wait() (after acquiring the inode's i_mutex) would not work and would lead to a deadlock when doing AIO, as through AIO we end up in a path where the fsync handler is called (through dio_aio_complete_work() -> dio_complete() -> vfs_fsync_range()) before the inode's dio counter is decremented (inode_dio_wait() waits for this counter to have a value of zero).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- fs/btrfs/tree-log.c 14
1 files changed, 11 insertions, 3 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4127 struct inode *inode, 4127 struct inode *inode,
4128 struct btrfs_path *path, 4128 struct btrfs_path *path,
4129 struct list_head *logged_list, 4129 struct list_head *logged_list,
4130 struct btrfs_log_ctx *ctx) 4130 struct btrfs_log_ctx *ctx,
4131 const u64 start,
4132 const u64 end)
4131{ 4133{
4132 struct extent_map *em, *n; 4134 struct extent_map *em, *n;
4133 struct list_head extents; 4135 struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4166 } 4168 }
4167 4169
4168 list_sort(NULL, &extents, extent_cmp); 4170 list_sort(NULL, &extents, extent_cmp);
4169 4171 /*
4172 * Collect any new ordered extents within the range. This is to
4173 * prevent logging file extent items without waiting for the disk
4174 * location they point to being written. We do this only to deal
4175 * with races against concurrent lockless direct IO writes.
4176 */
4177 btrfs_get_logged_extents(inode, logged_list, start, end);
4170process: 4178process:
4171 while (!list_empty(&extents)) { 4179 while (!list_empty(&extents)) {
4172 em = list_entry(extents.next, struct extent_map, list); 4180 em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
4701 goto out_unlock; 4709 goto out_unlock;
4702 } 4710 }
4703 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4711 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4704 &logged_list, ctx); 4712 &logged_list, ctx, start, end);
4705 if (ret) { 4713 if (ret) {
4706 err = ret; 4714 err = ret;
4707 goto out_unlock; 4715 goto out_unlock;