author		Josef Bacik <josef@redhat.com>		2012-06-19 10:59:00 -0400
committer	Chris Mason <chris.mason@fusionio.com>	2012-07-02 15:36:23 -0400
commit		c3473e830074ef04f974f2829690942dd8580619 (patch)
tree		21e5e5117bffcf4cdb01e6985346747aeccd331e /fs/btrfs
parent		597a60fadedf9a40fdff8735054bf772b3dafd57 (diff)
Btrfs: fix dio write vs buffered read race
Miao pointed out there's a problem with mixing dio writes and buffered reads. If a read happens between us invalidating the page range and actually locking the extent, it can pull pages into the page cache; then, once the write finishes, anybody who reads that range again will just find uptodate pages and read stale data. So we need to lock the extent and check for uptodate bits in the range. If there are uptodate bits we need to unlock and invalidate again. This keeps the race from happening, since we hold the extent locked until we create the ordered extent, and the read side always waits for ordered extents.

There was also a race in how we updated i_size. Previously we relied on the generic DIO code to adjust the in-memory i_size after the DIO completed, but that happens outside of the extent lock, which means reads could come in and not see the updated i_size. So instead move this work into where we create the extents; that way the ordered i_size update works properly in the endio handlers. Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
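To make the retry logic concrete, below is a minimal, self-contained userspace C sketch of the loop this patch adds to the DIO write path. Every helper in it (range_lock, range_has_ordered_extent, range_has_uptodate_pages, wait_for_ordered_extent, flush_and_invalidate_range) is a hypothetical stand-in for the extent-locking and page-cache calls visible in the diff below; it models only the control flow of the fix, not real btrfs locking.

/*
 * Hypothetical model of the retry loop added to the dio write path.
 * The helpers stand in for extent locking, btrfs_lookup_ordered_range(),
 * test_range_bit(EXTENT_UPTODATE) and invalidate_inode_pages2_range();
 * they are NOT real kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

static int attempts;

static void range_lock(void)   { /* stand-in for locking the extent range */ }
static void range_unlock(void) { /* stand-in for unlock_extent_cached() */ }

/* Pretend an ordered extent is pending on the first pass only. */
static bool range_has_ordered_extent(void) { return attempts == 0; }

/* Pretend a buffered read raced in and left uptodate pages once. */
static bool range_has_uptodate_pages(void) { return attempts == 1; }

static void wait_for_ordered_extent(void) { puts("waiting on ordered extent"); }

static int flush_and_invalidate_range(void)
{
	puts("re-invalidating page cache for the range");
	return 0;	/* nonzero would mean: fall back to buffered I/O */
}

static int lock_range_for_dio_write(void)
{
	for (;;) {
		range_lock();

		/*
		 * Safe to proceed only if no ordered extent is pending AND
		 * no buffered pages snuck in between the earlier invalidate
		 * and taking the lock; otherwise unlock and retry.
		 */
		if (!range_has_ordered_extent() && !range_has_uptodate_pages())
			return 0;	/* hold the lock until the ordered extent exists */

		range_unlock();
		if (range_has_ordered_extent())
			wait_for_ordered_extent();
		else if (flush_and_invalidate_range())
			return -1;	/* give up; caller falls back to buffered */
		attempts++;
	}
}

int main(void)
{
	if (lock_range_for_dio_write() == 0)
		puts("range locked, stale-read race excluded");
	return 0;
}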
Diffstat (limited to 'fs/btrfs')
-rw-r--r--	fs/btrfs/file.c		13
-rw-r--r--	fs/btrfs/inode.c	55
2 files changed, 50 insertions(+), 18 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 876cddd6b2f0..248d20265249 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1334,7 +1334,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 				    loff_t *ppos, size_t count, size_t ocount)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = fdentry(file)->d_inode;
 	struct iov_iter i;
 	ssize_t written;
 	ssize_t written_buffered;
@@ -1344,18 +1343,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
 					    count, ocount);
 
-	/*
-	 * the generic O_DIRECT will update in-memory i_size after the
-	 * DIOs are done.  But our endio handlers that update the on
-	 * disk i_size never update past the in memory i_size.  So we
-	 * need one more update here to catch any additions to the
-	 * file
-	 */
-	if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
-		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-		mark_inode_dirty(inode);
-	}
-
 	if (written < 0 || written == count)
 		return written;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a4f2d59a64b..6971fb5fc859 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5904,8 +5904,17 @@ map:
 	bh_result->b_size = len;
 	bh_result->b_bdev = em->bdev;
 	set_buffer_mapped(bh_result);
-	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-		set_buffer_new(bh_result);
+	if (create) {
+		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			set_buffer_new(bh_result);
+
+		/*
+		 * Need to update the i_size under the extent lock so buffered
+		 * readers will get the updated i_size when we unlock.
+		 */
+		if (start + len > i_size_read(inode))
+			i_size_write(inode, start + len);
+	}
 
 	free_extent_map(em);
 
@@ -6388,12 +6397,48 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 		 */
 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
 						     lockend - lockstart + 1);
-		if (!ordered)
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent.  The
+		 * invalidate needs to happen so that reads after a write do
+		 * not get stale data.
+		 */
+		if (!ordered && (!writing ||
+		    !test_range_bit(&BTRFS_I(inode)->io_tree,
+				    lockstart, lockend, EXTENT_UPTODATE, 0,
+				    cached_state)))
 			break;
+
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 				     &cached_state, GFP_NOFS);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			/* Screw you mmap */
+			ret = filemap_write_and_wait_range(file->f_mapping,
+							   lockstart,
+							   lockend);
+			if (ret)
+				goto out;
+
+			/*
+			 * If we found a page that couldn't be invalidated just
+			 * fall back to buffered.
+			 */
+			ret = invalidate_inode_pages2_range(file->f_mapping,
+					lockstart >> PAGE_CACHE_SHIFT,
+					lockend >> PAGE_CACHE_SHIFT);
+			if (ret) {
+				if (ret == -EBUSY)
+					ret = 0;
+				goto out;
+			}
+		}
+
 		cond_resched();
 	}
 