about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-01-21 05:17:54 -0500
committerChris Mason <clm@fb.com>2016-01-25 19:50:26 -0500
commitde0ee0edb21fbab4c7afa3e94573ecfebfb0244e (patch)
tree52b0a9db694a7810bed41f04fa5be2b843352484 /fs
parent6b5aa88c861cf0e4156e490009e2018d4fc81109 (diff)
Btrfs: fix race between fsync and lockless direct IO writes
An fsync, using the fast path, can race with a concurrent lockless direct IO write and end up logging a file extent item that points to an extent that wasn't written to yet. This is because the fast fsync path collects ordered extents into a local list and then collects all the new extent maps to log file extent items based on them, while the direct IO write path creates the new extent map before it creates the corresponding ordered extent (and submits the respective bio(s)).

So fix this by making the direct IO write path create ordered extents before the extent maps, and by making the fast fsync path collect any new ordered extents after it collects the extent maps.

Note that making the fsync handler call inode_dio_wait() (after acquiring the inode's i_mutex) would not work and would lead to a deadlock when doing AIO, as through AIO we end up in a path where the fsync handler is called (through dio_aio_complete_work() -> dio_complete() -> vfs_fsync_range()) before the inode's dio counter is decremented (inode_dio_wait() waits for this counter to have a value of zero).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/inode.c36
-rw-r--r--fs/btrfs/tree-log.c14
2 files changed, 39 insertions, 11 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b8bb7591ff9f..e4565456eb01 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7115,21 +7115,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	if (ret)
 		return ERR_PTR(ret);
 
-	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-			      ins.offset, ins.offset, ins.offset, 0);
-	if (IS_ERR(em)) {
-		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		return em;
-	}
-
+	/*
+	 * Create the ordered extent before the extent map. This is to avoid
+	 * races with the fast fsync path that would lead to it logging file
+	 * extent items that point to disk extents that were not yet written to.
+	 * The fast fsync path collects ordered extents into a local list and
+	 * then collects all the new extent maps, so we must create the ordered
+	 * extent first and make sure the fast fsync path collects any new
+	 * ordered extents after collecting new extent maps as well.
+	 * The fsync path simply can not rely on inode_dio_wait() because it
+	 * causes deadlock with AIO.
+	 */
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
 	if (ret) {
 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		free_extent_map(em);
 		return ERR_PTR(ret);
 	}
 
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, ins.offset, 0);
+	if (IS_ERR(em)) {
+		struct btrfs_ordered_extent *oe;
+
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+		oe = btrfs_lookup_ordered_extent(inode, start);
+		ASSERT(oe);
+		if (WARN_ON(!oe))
+			return em;
+		set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+		set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+		btrfs_remove_ordered_extent(inode, oe);
+		/* Once for our lookup and once for the ordered extents tree. */
+		btrfs_put_ordered_extent(oe);
+		btrfs_put_ordered_extent(oe);
+	}
 	return em;
 }
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     struct btrfs_path *path,
 				     struct list_head *logged_list,
-				     struct btrfs_log_ctx *ctx)
+				     struct btrfs_log_ctx *ctx,
+				     const u64 start,
+				     const u64 end)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 
 	list_sort(NULL, &extents, extent_cmp);
-
+	/*
+	 * Collect any new ordered extents within the range. This is to
+	 * prevent logging file extent items without waiting for the disk
+	 * location they point to being written. We do this only to deal
+	 * with races against concurrent lockless direct IO writes.
+	 */
+	btrfs_get_logged_extents(inode, logged_list, start, end);
 process:
 	while (!list_empty(&extents)) {
 		em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
 		goto out_unlock;
 	}
 	ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-					&logged_list, ctx);
+					&logged_list, ctx, start, end);
 	if (ret) {
 		err = ret;
 		goto out_unlock;