author    Josef Bacik <jbacik@fusionio.com>      2012-07-31 16:28:48 -0400
committer Chris Mason <chris.mason@oracle.com>   2012-08-28 16:53:27 -0400
commit    eb838e73dc2121d2bae47d5678952cd7d48793b5
tree      e451338c2f68b413f1a3f5821ec0d63f8ef60196 /fs
parent    dadd1105ca9a1e506c678e8e410e9623efdda821
Btrfs: lock extents as we map them in DIO
A deadlock in xfstests 113 was uncovered by commit
d187663ef24cd3d033f0cbf2867e70b36a3a90b8.

This is because we would not return EIOCBQUEUED for short AIO reads; instead
we'd wait for the DIO to complete and then return the amount of data we
transferred, which would allow us to unlock the remaining range.  But with
that change this no longer happens, so if we have a short AIO read (for
example if we try to read past EOF) we could leave the section from EOF to
the end of where we tried to read locked.

Fixing this is tricky since there is no clear way to know exactly how much
data DIO truly submitted for IO, so to make this simpler and less cumbersome
we need to lock the extents as we try to map them, and then unlock any areas
we didn't actually map.  This makes us completely safe from deadlocks and
reliance on a particular behavior of the DIO code.  It also lays the
groundwork for allowing us to use the normal csum storage method for reads,
which means we can remove an allocation.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
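To see the new scheme in isolation, here is a minimal userspace sketch of the
pattern this patch moves into btrfs_get_blocks_direct(): lock the whole
requested range before mapping, map what we can, then unlock only the tail
that was not actually mapped, so a short DIO read can never leave part of the
range locked.  This is not btrfs code; range_lock(), range_unlock() and
map_blocks() are made-up stand-ins for the io_tree extent locking and
btrfs_get_extent().

#include <stdio.h>

typedef unsigned long long u64;

/* Hypothetical stand-ins for extent io_tree range locking. */
static void range_lock(u64 start, u64 end)
{
	printf("lock   [%llu, %llu]\n", start, end);
}

static void range_unlock(u64 start, u64 end)
{
	printf("unlock [%llu, %llu]\n", start, end);
}

/* Pretend mapping stops at "eof": returns how many bytes were mapped. */
static u64 map_blocks(u64 start, u64 len, u64 eof)
{
	if (start >= eof)
		return 0;
	return (start + len > eof) ? eof - start : len;
}

/* Lock as we map; unlock whatever part of the range we did not map. */
static int get_blocks_direct(u64 start, u64 len, u64 eof)
{
	u64 lockstart = start;
	u64 lockend = start + len - 1;
	u64 mapped;

	range_lock(lockstart, lockend);

	mapped = map_blocks(start, len, eof);
	if (!mapped) {
		/* Error path: nothing mapped, release the whole range. */
		range_unlock(lockstart, lockend);
		return -1;
	}

	/*
	 * Keep only the mapped part locked and hand back the unused tail.
	 * The mapped part stays locked here; the completion path is
	 * responsible for it.
	 */
	lockstart = start + mapped;
	if (lockstart < lockend)
		range_unlock(lockstart, lockend);
	return 0;
}

int main(void)
{
	/* A "read past EOF": 8192 bytes requested, only 4096 can be mapped. */
	get_blocks_direct(0, 8192, 4096);
	return 0;
}

The point is that the unlock of the unmapped tail happens inside the mapping
callback itself, so we no longer depend on how much IO the generic DIO code
reports back to decide what is left to unlock.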
Diffstat (limited to 'fs')
-rw-r--r--    fs/btrfs/inode.c    256
1 file changed, 127 insertions(+), 129 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dac1fc21d809..09182449cbdf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5773,18 +5773,109 @@ out:
 	return ret;
 }
 
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+			      struct extent_state **cached_state, int writing)
+{
+	struct btrfs_ordered_extent *ordered;
+	int ret = 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, cached_state);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent. The
+		 * invalidate needs to happen so that reads after a write do not
+		 * get stale data.
+		 */
+		if (!ordered && (!writing ||
+		    !test_range_bit(&BTRFS_I(inode)->io_tree,
+				    lockstart, lockend, EXTENT_UPTODATE, 0,
+				    *cached_state)))
+			break;
+
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     cached_state, GFP_NOFS);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			/* Screw you mmap */
+			ret = filemap_write_and_wait_range(inode->i_mapping,
+							   lockstart,
+							   lockend);
+			if (ret)
+				break;
+
+			/*
+			 * If we found a page that couldn't be invalidated just
+			 * fall back to buffered.
+			 */
+			ret = invalidate_inode_pages2_range(inode->i_mapping,
+					lockstart >> PAGE_CACHE_SHIFT,
+					lockend >> PAGE_CACHE_SHIFT);
+			if (ret)
+				break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
+	u64 lockstart, lockend;
 	u64 len = bh_result->b_size;
 	struct btrfs_trans_handle *trans;
+	int unlock_bits = EXTENT_LOCKED;
+	int ret;
+
+	lockstart = start;
+	lockend = start + len - 1;
+	if (create) {
+		ret = btrfs_delalloc_reserve_space(inode, len);
+		if (ret)
+			return ret;
+		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+	}
+
+	/*
+	 * If this errors out it's because we couldn't invalidate pagecache for
+	 * this range and we need to fallback to buffered.
+	 */
+	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+		return -ENOTBLK;
+
+	if (create) {
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, EXTENT_DELALLOC, NULL,
+				     &cached_state, GFP_NOFS);
+		if (ret)
+			goto unlock_err;
+	}
 
 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 
 	/*
 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5803,17 +5894,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
 	    em->block_start == EXTENT_MAP_INLINE) {
 		free_extent_map(em);
-		return -ENOTBLK;
+		ret = -ENOTBLK;
+		goto unlock_err;
 	}
 
 	/* Just a good old fashioned hole, return */
 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
 		free_extent_map(em);
-		/* DIO will do one hole at a time, so just unlock a sector */
-		unlock_extent(&BTRFS_I(inode)->io_tree, start,
-			      start + root->sectorsize - 1);
-		return 0;
+		ret = 0;
+		goto unlock_err;
 	}
 
 	/*
@@ -5826,8 +5916,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	 *
 	 */
 	if (!create) {
-		len = em->len - (start - em->start);
-		goto map;
+		len = min(len, em->len - (start - em->start));
+		lockstart = start + len;
+		goto unlock;
 	}
 
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5859,7 +5950,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		btrfs_end_transaction(trans, root);
 		if (ret) {
 			free_extent_map(em);
-			return ret;
+			goto unlock_err;
 		}
 		goto unlock;
 	}
@@ -5872,14 +5963,12 @@ must_cow:
 	 */
 	len = bh_result->b_size;
 	em = btrfs_new_extent_direct(inode, em, start, len);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 	len = min(len, em->len - (start - em->start));
 unlock:
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-			 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
-			 0, NULL, GFP_NOFS);
-map:
 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
 		inode->i_blkbits;
 	bh_result->b_size = len;
@@ -5897,9 +5986,28 @@ map:
 		i_size_write(inode, start + len);
 	}
 
+	/*
+	 * In the case of write we need to clear and unlock the entire range,
+	 * in the case of read we need to unlock only the end area that we
+	 * aren't using if there is any left over space.
+	 */
+	if (lockstart < lockend)
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	else
+		free_extent_state(cached_state);
+
 	free_extent_map(em);
 
 	return 0;
+
+unlock_err:
+	if (create)
+		unlock_bits |= EXTENT_DO_ACCOUNTING;
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	return ret;
 }
 
 struct btrfs_dio_private {
@@ -6340,132 +6448,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
 out:
 	return retval;
 }
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cached_state = NULL;
-	u64 lockstart, lockend;
-	ssize_t ret;
-	int writing = rw & WRITE;
-	int write_bits = 0;
-	size_t count = iov_length(iov, nr_segs);
 
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-			    offset, nr_segs)) {
+			    offset, nr_segs))
 		return 0;
-	}
-
-	lockstart = offset;
-	lockend = offset + count - 1;
-
-	if (writing) {
-		ret = btrfs_delalloc_reserve_space(inode, count);
-		if (ret)
-			goto out;
-	}
 
-	while (1) {
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 0, &cached_state);
-		/*
-		 * We're concerned with the entire range that we're going to be
-		 * doing DIO to, so we need to make sure theres no ordered
-		 * extents in this range.
-		 */
-		ordered = btrfs_lookup_ordered_range(inode, lockstart,
-						     lockend - lockstart + 1);
-
-		/*
-		 * We need to make sure there are no buffered pages in this
-		 * range either, we could have raced between the invalidate in
-		 * generic_file_direct_write and locking the extent. The
-		 * invalidate needs to happen so that reads after a write do not
-		 * get stale data.
-		 */
-		if (!ordered && (!writing ||
-		    !test_range_bit(&BTRFS_I(inode)->io_tree,
-				    lockstart, lockend, EXTENT_UPTODATE, 0,
-				    cached_state)))
-			break;
-
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     &cached_state, GFP_NOFS);
-
-		if (ordered) {
-			btrfs_start_ordered_extent(inode, ordered, 1);
-			btrfs_put_ordered_extent(ordered);
-		} else {
-			/* Screw you mmap */
-			ret = filemap_write_and_wait_range(file->f_mapping,
-							   lockstart,
-							   lockend);
-			if (ret)
-				goto out;
-
-			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
-			 */
-			ret = invalidate_inode_pages2_range(file->f_mapping,
-					lockstart >> PAGE_CACHE_SHIFT,
-					lockend >> PAGE_CACHE_SHIFT);
-			if (ret) {
-				if (ret == -EBUSY)
-					ret = 0;
-				goto out;
-			}
-		}
-
-		cond_resched();
-	}
-
-	/*
-	 * we don't use btrfs_set_extent_delalloc because we don't want
-	 * the dirty or uptodate bits
-	 */
-	if (writing) {
-		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
-		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     EXTENT_DELALLOC, NULL, &cached_state,
-				     GFP_NOFS);
-		if (ret) {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockend, EXTENT_LOCKED | write_bits,
-					 1, 0, &cached_state, GFP_NOFS);
-			goto out;
-		}
-	}
-
-	free_extent_state(cached_state);
-	cached_state = NULL;
-
-	ret = __blockdev_direct_IO(rw, iocb, inode,
+	return __blockdev_direct_IO(rw, iocb, inode,
 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
 		   btrfs_submit_direct, 0);
-
-	if (ret < 0 && ret != -EIOCBQUEUED) {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-				 offset + iov_length(iov, nr_segs) - 1,
-				 EXTENT_LOCKED | write_bits, 1, 0,
-				 &cached_state, GFP_NOFS);
-	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
-		/*
-		 * We're falling back to buffered, unlock the section we didn't
-		 * do IO on.
-		 */
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
-				 offset + iov_length(iov, nr_segs) - 1,
-				 EXTENT_LOCKED | write_bits, 1, 0,
-				 &cached_state, GFP_NOFS);
-	}
-out:
-	free_extent_state(cached_state);
-	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,