aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-05-12 08:53:36 -0400
committerFilipe Manana <fdmanana@suse.com>2016-05-12 20:59:36 -0400
commit5f9a8a51d8b95505d8de8b7191ae2ed8c504d4af (patch)
treed97a7f5d321694e09c3046e9027c23a02d6a5878 /fs/btrfs/tree-log.c
parentf78c436c3931e7df713688028f2b4faf72bf9f2a (diff)
Btrfs: add semaphore to synchronize direct IO writes with fsync
Due to the optimization of lockless direct IO writes (the inode's i_mutex is not held) introduced in commit 38851cc19adb ("Btrfs: implement unlocked dio write"), we started having races between such writes with concurrent fsync operations that use the fast fsync path. These races were addressed in the patches titled "Btrfs: fix race between fsync and lockless direct IO writes" and "Btrfs: fix race between fsync and direct IO writes for prealloc extents". The races happened because the direct IO path, like every other write path, does create extent maps followed by the corresponding ordered extents while the fast fsync path collected first ordered extents and then it collected extent maps. This made it possible to log file extent items (based on the collected extent maps) without waiting for the corresponding ordered extents to complete (get their IO done). The two fixes mentioned before added a solution that consists of making the direct IO path create first the ordered extents and then the extent maps, while the fsync path attempts to collect any new ordered extents once it collects the extent maps. This was simple and did not require adding any synchonization primitive to any data structure (struct btrfs_inode for example) but it makes things more fragile for future development endeavours and adds an exceptional approach compared to the other write paths. This change adds a read-write semaphore to the btrfs inode structure and makes the direct IO path create the extent maps and the ordered extents while holding read access on that semaphore, while the fast fsync path collects extent maps and ordered extents while holding write access on that semaphore. The logic for direct IO write path is encapsulated in a new helper function that is used both for cow and nocow direct IO writes. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <jbacik@fb.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c51
1 files changed, 14 insertions, 37 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a24a0ba523d6..003a826f4cff 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4141 4141
4142 INIT_LIST_HEAD(&extents); 4142 INIT_LIST_HEAD(&extents);
4143 4143
4144 down_write(&BTRFS_I(inode)->dio_sem);
4144 write_lock(&tree->lock); 4145 write_lock(&tree->lock);
4145 test_gen = root->fs_info->last_trans_committed; 4146 test_gen = root->fs_info->last_trans_committed;
4146 4147
@@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4169 } 4170 }
4170 4171
4171 list_sort(NULL, &extents, extent_cmp); 4172 list_sort(NULL, &extents, extent_cmp);
4173 btrfs_get_logged_extents(inode, logged_list, start, end);
4172 /* 4174 /*
4173 * Collect any new ordered extents within the range. This is to 4175 * Some ordered extents started by fsync might have completed
4174 * prevent logging file extent items without waiting for the disk 4176 * before we could collect them into the list logged_list, which
4175 * location they point to being written. We do this only to deal 4177 * means they're gone, not in our logged_list nor in the inode's
4176 * with races against concurrent lockless direct IO writes. 4178 * ordered tree. We want the application/user space to know an
4179 * error happened while attempting to persist file data so that
4180 * it can take proper action. If such error happened, we leave
4181 * without writing to the log tree and the fsync must report the
4182 * file data write error and not commit the current transaction.
4177 */ 4183 */
4178 btrfs_get_logged_extents(inode, logged_list, start, end); 4184 ret = btrfs_inode_check_errors(inode);
4185 if (ret)
4186 ctx->io_err = ret;
4179process: 4187process:
4180 while (!list_empty(&extents)) { 4188 while (!list_empty(&extents)) {
4181 em = list_entry(extents.next, struct extent_map, list); 4189 em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4210,7 @@ process:
4202 } 4210 }
4203 WARN_ON(!list_empty(&extents)); 4211 WARN_ON(!list_empty(&extents));
4204 write_unlock(&tree->lock); 4212 write_unlock(&tree->lock);
4213 up_write(&BTRFS_I(inode)->dio_sem);
4205 4214
4206 btrfs_release_path(path); 4215 btrfs_release_path(path);
4207 return ret; 4216 return ret;
@@ -4623,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4623 mutex_lock(&BTRFS_I(inode)->log_mutex); 4632 mutex_lock(&BTRFS_I(inode)->log_mutex);
4624 4633
4625 /* 4634 /*
4626 * Collect ordered extents only if we are logging data. This is to
4627 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
4628 * will process the ordered extents if they still exists at the time,
4629 * because when we collect them we test and set for the flag
4630 * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
4631 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
4632 * not processing the ordered extents is that we end up logging the
4633 * corresponding file extent items, based on the extent maps in the
4634 * inode's extent_map_tree's modified_list, without logging the
4635 * respective checksums (since the may still be only attached to the
4636 * ordered extents and have not been inserted in the csum tree by
4637 * btrfs_finish_ordered_io() yet).
4638 */
4639 if (inode_only == LOG_INODE_ALL)
4640 btrfs_get_logged_extents(inode, &logged_list, start, end);
4641
4642 /*
4643 * a brute force approach to making sure we get the most uptodate 4635 * a brute force approach to making sure we get the most uptodate
4644 * copies of everything. 4636 * copies of everything.
4645 */ 4637 */
@@ -4846,21 +4838,6 @@ log_extents:
4846 goto out_unlock; 4838 goto out_unlock;
4847 } 4839 }
4848 if (fast_search) { 4840 if (fast_search) {
4849 /*
4850 * Some ordered extents started by fsync might have completed
4851 * before we collected the ordered extents in logged_list, which
4852 * means they're gone, not in our logged_list nor in the inode's
4853 * ordered tree. We want the application/user space to know an
4854 * error happened while attempting to persist file data so that
4855 * it can take proper action. If such error happened, we leave
4856 * without writing to the log tree and the fsync must report the
4857 * file data write error and not commit the current transaction.
4858 */
4859 err = btrfs_inode_check_errors(inode);
4860 if (err) {
4861 ctx->io_err = err;
4862 goto out_unlock;
4863 }
4864 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4841 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4865 &logged_list, ctx, start, end); 4842 &logged_list, ctx, start, end);
4866 if (ret) { 4843 if (ret) {