author		Josef Bacik <josef@redhat.com>		2012-06-19 10:59:00 -0400
committer	Chris Mason <chris.mason@fusionio.com>	2012-07-02 15:36:23 -0400
commit		c3473e830074ef04f974f2829690942dd8580619 (patch)
tree		21e5e5117bffcf4cdb01e6985346747aeccd331e /fs/btrfs
parent		597a60fadedf9a40fdff8735054bf772b3dafd57 (diff)
Btrfs: fix dio write vs buffered read race
Miao pointed out there's a problem with mixing dio writes and buffered reads. If a read happens between us invalidating the page range and actually locking the extent, it can pull pages into the page cache; then, once the write finishes, anybody who reads that range again will just find uptodate pages and read stale data. So we need to lock the extent and check for uptodate bits in the range. If there are uptodate bits we need to unlock and invalidate again. This keeps the race from happening, since we hold the extent locked until we create the ordered extent, and the read side always waits for ordered extents.

There was also a race in how we updated i_size. Previously we relied on the generic DIO code to adjust the in-memory i_size after the DIO completed, but that happens outside of the extent lock, which means reads could come in and not see the updated i_size. So instead move this work into where we create the extents; that way the ordered i_size update works properly in the endio handlers. Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
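To make the retry logic concrete, below is a minimal, self-contained userspace C sketch of the loop this patch adds to the DIO write path. Every helper in it (range_lock, range_has_ordered_extent, range_has_uptodate_pages, wait_for_ordered_extent, flush_and_invalidate_range) is a hypothetical stand-in for the extent-locking and page-cache calls visible in the diff below; it models only the control flow of the fix, not real btrfs locking.

/*
 * Hypothetical model of the retry loop added to the dio write path.
 * The helpers stand in for extent locking, btrfs_lookup_ordered_range(),
 * test_range_bit(EXTENT_UPTODATE) and invalidate_inode_pages2_range();
 * they are NOT real kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

static int attempts;

static void range_lock(void)   { /* stand-in for locking the extent range */ }
static void range_unlock(void) { /* stand-in for unlock_extent_cached() */ }

/* Pretend an ordered extent is pending on the first pass only. */
static bool range_has_ordered_extent(void) { return attempts == 0; }

/* Pretend a buffered read raced in and left uptodate pages once. */
static bool range_has_uptodate_pages(void) { return attempts == 1; }

static void wait_for_ordered_extent(void) { puts("waiting on ordered extent"); }

static int flush_and_invalidate_range(void)
{
	puts("re-invalidating page cache for the range");
	return 0;	/* nonzero would mean: fall back to buffered I/O */
}

static int lock_range_for_dio_write(void)
{
	for (;;) {
		range_lock();

		/*
		 * Safe to proceed only if no ordered extent is pending AND
		 * no buffered pages snuck in between the earlier invalidate
		 * and taking the lock; otherwise unlock and retry.
		 */
		if (!range_has_ordered_extent() && !range_has_uptodate_pages())
			return 0;	/* hold the lock until the ordered extent exists */

		range_unlock();
		if (range_has_ordered_extent())
			wait_for_ordered_extent();
		else if (flush_and_invalidate_range())
			return -1;	/* give up; caller falls back to buffered */
		attempts++;
	}
}

int main(void)
{
	if (lock_range_for_dio_write() == 0)
		puts("range locked, stale-read race excluded");
	return 0;
}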
Diffstat (limited to 'fs/btrfs')
-rw-r--r--	fs/btrfs/file.c		13
-rw-r--r--	fs/btrfs/inode.c	55
2 files changed, 50 insertions(+), 18 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 876cddd6b2f0..248d20265249 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1334,7 +1334,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 				    loff_t *ppos, size_t count, size_t ocount)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = fdentry(file)->d_inode;
 	struct iov_iter i;
 	ssize_t written;
 	ssize_t written_buffered;
@@ -1344,18 +1343,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
 					    count, ocount);
 
-	/*
-	 * the generic O_DIRECT will update in-memory i_size after the
-	 * DIOs are done.  But our endio handlers that update the on
-	 * disk i_size never update past the in memory i_size.  So we
-	 * need one more update here to catch any additions to the
-	 * file
-	 */
-	if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
-		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-		mark_inode_dirty(inode);
-	}
-
 	if (written < 0 || written == count)
 		return written;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a4f2d59a64b..6971fb5fc859 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5904,8 +5904,17 @@ map:
 	bh_result->b_size = len;
 	bh_result->b_bdev = em->bdev;
 	set_buffer_mapped(bh_result);
-	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-		set_buffer_new(bh_result);
+	if (create) {
+		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			set_buffer_new(bh_result);
+
+		/*
+		 * Need to update the i_size under the extent lock so buffered
+		 * readers will get the updated i_size when we unlock.
+		 */
+		if (start + len > i_size_read(inode))
+			i_size_write(inode, start + len);
+	}
 
 	free_extent_map(em);
 
@@ -6388,12 +6397,48 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 		 */
 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
 						     lockend - lockstart + 1);
-		if (!ordered)
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent.  The
+		 * invalidate needs to happen so that reads after a write do
+		 * not get stale data.
+		 */
+		if (!ordered && (!writing ||
+		    !test_range_bit(&BTRFS_I(inode)->io_tree,
+				    lockstart, lockend, EXTENT_UPTODATE, 0,
+				    cached_state)))
 			break;
+
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 				     &cached_state, GFP_NOFS);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			/* Screw you mmap */
+			ret = filemap_write_and_wait_range(file->f_mapping,
+							   lockstart,
+							   lockend);
+			if (ret)
+				goto out;
+
+			/*
+			 * If we found a page that couldn't be invalidated just
+			 * fall back to buffered.
+			 */
+			ret = invalidate_inode_pages2_range(file->f_mapping,
+					lockstart >> PAGE_CACHE_SHIFT,
+					lockend >> PAGE_CACHE_SHIFT);
+			if (ret) {
+				if (ret == -EBUSY)
+					ret = 0;
+				goto out;
+			}
+		}
+
 		cond_resched();
 	}
 