author     Josef Bacik <jbacik@fusionio.com>      2012-07-31 16:28:48 -0400
committer  Chris Mason <chris.mason@oracle.com>   2012-08-28 16:53:27 -0400
commit     eb838e73dc2121d2bae47d5678952cd7d48793b5
tree       e451338c2f68b413f1a3f5821ec0d63f8ef60196 /fs
parent     dadd1105ca9a1e506c678e8e410e9623efdda821
Btrfs: lock extents as we map them in DIO
A deadlock in xfstests 113 was uncovered by commit
d187663ef24cd3d033f0cbf2867e70b36a3a90b8.
This is because we would not return EIOCBQUEUED for short AIO reads; instead
we'd wait for the DIO to complete and then return the amount of data we
transferred, which would allow our code to unlock the remaining range. But
with this change this no longer happens, so if we have a short AIO read (for
example if we try to read past EOF), we could leave the section from EOF to
the end of where we tried to read locked. Fixing this is tricky since there
is no clear way to know exactly how much data DIO truly submitted for IO, so
to make this easier on ourselves and less cumbersome we need to lock the
extents as we try to map them, and then unlock any areas we didn't
actually map. This makes us completely safe from deadlocks and reliance on
a particular behavior of the DIO code. This also lays the groundwork for
allowing us to use the normal csum storage method for reads which means we
can remove an allocation. Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
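
The range bookkeeping at the heart of the fix is easiest to see with concrete
numbers. The toy model below (plain userspace C; all names and values are
illustrative, not the btrfs API) mirrors what the read path now does: lock
[lockstart, lockend] up front, trim len to what the extent map actually
covered, and unlock only the tail that was never mapped.

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/*
 * Toy model of the post-patch read path in btrfs_get_blocks_direct():
 * everything here is illustrative, not the kernel API.
 */
int main(void)
{
	uint64_t start = 4096;			/* offset DIO asked us to map */
	uint64_t len = 64 * 1024;		/* bytes the caller wanted */
	uint64_t lockstart = start;
	uint64_t lockend = start + len - 1;	/* locked before mapping */

	/* pretend the extent map only covers 16K of the request */
	uint64_t em_start = 0, em_len = 20 * 1024;

	len = MIN(len, em_len - (start - em_start));
	lockstart = start + len;	/* bytes past here were never mapped */

	if (lockstart < lockend)
		printf("unlock unused tail [%llu, %llu]\n",
		       (unsigned long long)lockstart,
		       (unsigned long long)lockend);
	else
		printf("entire locked range was mapped\n");
	return 0;
}

Only the read path advances lockstart; on writes it stays at the start of the
range, so the same final unlock covers the whole reservation.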
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/inode.c | 256
1 file changed, 127 insertions(+), 129 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dac1fc21d809..09182449cbdf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5773,18 +5773,109 @@ out:
 	return ret;
 }
 
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+			      struct extent_state **cached_state, int writing)
+{
+	struct btrfs_ordered_extent *ordered;
+	int ret = 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, cached_state);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent.  The
+		 * invalidate needs to happen so that reads after a write do not
+		 * get stale data.
+		 */
+		if (!ordered && (!writing ||
+		    !test_range_bit(&BTRFS_I(inode)->io_tree,
+				    lockstart, lockend, EXTENT_UPTODATE, 0,
+				    *cached_state)))
+			break;
+
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     cached_state, GFP_NOFS);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			/* Screw you mmap */
+			ret = filemap_write_and_wait_range(inode->i_mapping,
+							   lockstart,
+							   lockend);
+			if (ret)
+				break;
+
+			/*
+			 * If we found a page that couldn't be invalidated just
+			 * fall back to buffered.
+			 */
+			ret = invalidate_inode_pages2_range(inode->i_mapping,
+					lockstart >> PAGE_CACHE_SHIFT,
+					lockend >> PAGE_CACHE_SHIFT);
+			if (ret)
+				break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
+	u64 lockstart, lockend;
 	u64 len = bh_result->b_size;
 	struct btrfs_trans_handle *trans;
+	int unlock_bits = EXTENT_LOCKED;
+	int ret;
+
+	lockstart = start;
+	lockend = start + len - 1;
+	if (create) {
+		ret = btrfs_delalloc_reserve_space(inode, len);
+		if (ret)
+			return ret;
+		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+	}
+
+	/*
+	 * If this errors out it's because we couldn't invalidate pagecache for
+	 * this range and we need to fallback to buffered.
+	 */
+	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+		return -ENOTBLK;
+
+	if (create) {
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, EXTENT_DELALLOC, NULL,
+				     &cached_state, GFP_NOFS);
+		if (ret)
+			goto unlock_err;
+	}
 
 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 
 	/*
 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5803,17 +5894,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
 	    em->block_start == EXTENT_MAP_INLINE) {
 		free_extent_map(em);
-		return -ENOTBLK;
+		ret = -ENOTBLK;
+		goto unlock_err;
 	}
 
 	/* Just a good old fashioned hole, return */
 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
 		free_extent_map(em);
-		/* DIO will do one hole at a time, so just unlock a sector */
-		unlock_extent(&BTRFS_I(inode)->io_tree, start,
-			      start + root->sectorsize - 1);
-		return 0;
+		ret = 0;
+		goto unlock_err;
 	}
 
 	/*
@@ -5826,8 +5916,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	 *
 	 */
 	if (!create) {
-		len = em->len - (start - em->start);
-		goto map;
+		len = min(len, em->len - (start - em->start));
+		lockstart = start + len;
+		goto unlock;
 	}
 
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5859,7 +5950,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		btrfs_end_transaction(trans, root);
 		if (ret) {
 			free_extent_map(em);
-			return ret;
+			goto unlock_err;
 		}
 		goto unlock;
 	}
@@ -5872,14 +5963,12 @@ must_cow:
 	 */
 	len = bh_result->b_size;
 	em = btrfs_new_extent_direct(inode, em, start, len);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 	len = min(len, em->len - (start - em->start));
 unlock:
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-			 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
-			 0, NULL, GFP_NOFS);
-map:
 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
 		inode->i_blkbits;
 	bh_result->b_size = len;
@@ -5897,9 +5986,28 @@ map:
 		i_size_write(inode, start + len);
 	}
 
+	/*
+	 * In the case of write we need to clear and unlock the entire range,
+	 * in the case of read we need to unlock only the end area that we
+	 * aren't using if there is any left over space.
+	 */
+	if (lockstart < lockend)
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	else
+		free_extent_state(cached_state);
+
 	free_extent_map(em);
 
 	return 0;
+
+unlock_err:
+	if (create)
+		unlock_bits |= EXTENT_DO_ACCOUNTING;
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	return ret;
 }
 
 struct btrfs_dio_private {
@@ -6340,132 +6448,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
 out:
 	return retval;
 }
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cached_state = NULL;
-	u64 lockstart, lockend;
-	ssize_t ret;
-	int writing = rw & WRITE;
-	int write_bits = 0;
-	size_t count = iov_length(iov, nr_segs);
 
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-			    offset, nr_segs)) {
+			    offset, nr_segs))
 		return 0;
-	}
-
-	lockstart = offset;
-	lockend = offset + count - 1;
-
-	if (writing) {
-		ret = btrfs_delalloc_reserve_space(inode, count);
-		if (ret)
-			goto out;
-	}
 
-	while (1) {
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 0, &cached_state);
-		/*
-		 * We're concerned with the entire range that we're going to be
-		 * doing DIO to, so we need to make sure theres no ordered
-		 * extents in this range.
-		 */
-		ordered = btrfs_lookup_ordered_range(inode, lockstart,
-						     lockend - lockstart + 1);
-
-		/*
-		 * We need to make sure there are no buffered pages in this
-		 * range either, we could have raced between the invalidate in
-		 * generic_file_direct_write and locking the extent.  The
-		 * invalidate needs to happen so that reads after a write do not
-		 * get stale data.
-		 */
-		if (!ordered && (!writing ||
-		    !test_range_bit(&BTRFS_I(inode)->io_tree,
-				    lockstart, lockend, EXTENT_UPTODATE, 0,
-				    cached_state)))
-			break;
-
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     &cached_state, GFP_NOFS);
-
-		if (ordered) {
-			btrfs_start_ordered_extent(inode, ordered, 1);
-			btrfs_put_ordered_extent(ordered);
-		} else {
-			/* Screw you mmap */
-			ret = filemap_write_and_wait_range(file->f_mapping,
-							   lockstart,
-							   lockend);
-			if (ret)
-				goto out;
-
-			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
-			 */
-			ret = invalidate_inode_pages2_range(file->f_mapping,
-					lockstart >> PAGE_CACHE_SHIFT,
-					lockend >> PAGE_CACHE_SHIFT);
-			if (ret) {
-				if (ret == -EBUSY)
-					ret = 0;
-				goto out;
-			}
-		}
-
-		cond_resched();
-	}
-
-	/*
-	 * we don't use btrfs_set_extent_delalloc because we don't want
-	 * the dirty or uptodate bits
-	 */
-	if (writing) {
-		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
-		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     EXTENT_DELALLOC, NULL, &cached_state,
-				     GFP_NOFS);
-		if (ret) {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockend, EXTENT_LOCKED | write_bits,
-					 1, 0, &cached_state, GFP_NOFS);
-			goto out;
-		}
-	}
-
-	free_extent_state(cached_state);
-	cached_state = NULL;
-
-	ret = __blockdev_direct_IO(rw, iocb, inode,
+	return __blockdev_direct_IO(rw, iocb, inode,
 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
 		   btrfs_submit_direct, 0);
-
-	if (ret < 0 && ret != -EIOCBQUEUED) {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-				 offset + iov_length(iov, nr_segs) - 1,
-				 EXTENT_LOCKED | write_bits, 1, 0,
-				 &cached_state, GFP_NOFS);
-	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
-		/*
-		 * We're falling back to buffered, unlock the section we didn't
-		 * do IO on.
-		 */
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
-				 offset + iov_length(iov, nr_segs) - 1,
-				 EXTENT_LOCKED | write_bits, 1, 0,
-				 &cached_state, GFP_NOFS);
-	}
-out:
-	free_extent_state(cached_state);
-	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
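
Taken together, the cleanup at the bottom of btrfs_get_blocks_direct() now
reads as follows (reconstructed from the '+' lines above; the comments are
editorial glosses, not part of the patch, and this is not a standalone
compilable unit):

	/*
	 * Success: writes clear and unlock the whole range; reads advanced
	 * lockstart past the mapped bytes, so only the unused tail (if any)
	 * is unlocked here.
	 */
	if (lockstart < lockend)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
	else
		free_extent_state(cached_state);

	free_extent_map(em);
	return 0;

unlock_err:
	/*
	 * Error: for writes, EXTENT_DO_ACCOUNTING makes the clear also drop
	 * the space reserved by btrfs_delalloc_reserve_space().
	 */
	if (create)
		unlock_bits |= EXTENT_DO_ACCOUNTING;
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
	return ret;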