diff options
author | chandan <chandan@linux.vnet.ibm.com> | 2015-08-28 11:40:13 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2015-09-21 16:47:55 -0400 |
commit | 50745b0a7f46f68574cd2b9ae24566bf026e7ebd (patch) | |
tree | 1484ef7e965b7940bf91996133dcd811fb3c1db0 /fs | |
parent | a30e577c96f59b1e1678ea5462432b09bf7d5cbc (diff) |
Btrfs: Direct I/O: Fix space accounting
The following call trace is seen when generic/095 test is executed,
WARNING: CPU: 3 PID: 2769 at /home/chandan/code/repos/linux/fs/btrfs/inode.c:8967 btrfs_destroy_inode+0x284/0x2a0()
Modules linked in:
CPU: 3 PID: 2769 Comm: umount Not tainted 4.2.0-rc5+ #31
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20150306_163512-brownie 04/01/2014
ffffffff81c08150 ffff8802ec9cbce8 ffffffff81984058 ffff8802ffd8feb0
0000000000000000 ffff8802ec9cbd28 ffffffff81050385 ffff8802ec9cbd38
ffff8802d12f8588 ffff8802d12f8588 ffff8802f15ab000 ffff8800bb96c0b0
Call Trace:
[<ffffffff81984058>] dump_stack+0x45/0x57
[<ffffffff81050385>] warn_slowpath_common+0x85/0xc0
[<ffffffff81050465>] warn_slowpath_null+0x15/0x20
[<ffffffff81340294>] btrfs_destroy_inode+0x284/0x2a0
[<ffffffff8117ce07>] destroy_inode+0x37/0x60
[<ffffffff8117cf39>] evict+0x109/0x170
[<ffffffff8117cfd5>] dispose_list+0x35/0x50
[<ffffffff8117dd3a>] evict_inodes+0xaa/0x100
[<ffffffff81165667>] generic_shutdown_super+0x47/0xf0
[<ffffffff81165951>] kill_anon_super+0x11/0x20
[<ffffffff81302093>] btrfs_kill_super+0x13/0x110
[<ffffffff81165c99>] deactivate_locked_super+0x39/0x70
[<ffffffff811660cf>] deactivate_super+0x5f/0x70
[<ffffffff81180e1e>] cleanup_mnt+0x3e/0x90
[<ffffffff81180ebd>] __cleanup_mnt+0xd/0x10
[<ffffffff81069c06>] task_work_run+0x96/0xb0
[<ffffffff81003a3d>] do_notify_resume+0x3d/0x50
[<ffffffff8198cbc2>] int_signal+0x12/0x17
This means that the inode had non-zero "outstanding extents" during
eviction. This occurs because, during direct I/O a task which successfully
used up its reserved data space would set BTRFS_INODE_DIO_READY bit and does
not clear the bit after finishing the DIO write. A future DIO write could
actually fail and the unused reserve space won't be freed because of the
previously set BTRFS_INODE_DIO_READY bit.
Clearing the BTRFS_INODE_DIO_READY bit in btrfs_direct_IO() caused the
following issue,
|-----------------------------------+-------------------------------------|
| Task A | Task B |
|-----------------------------------+-------------------------------------|
| Start direct i/o write on inode X.| |
| reserve space | |
| Allocate ordered extent | |
| release reserved space | |
| Set BTRFS_INODE_DIO_READY bit. | |
| | splice() |
| | Transfer data from pipe buffer to |
| | destination file. |
| | - kmap(pipe buffer page) |
| | - Start direct i/o write on |
| | inode X. |
| | - reserve space |
| | - dio_refill_pages() |
| | - sdio->blocks_available == 0 |
| | - Since a kernel address is |
| | being passed instead of a |
| | user space address, |
| | iov_iter_get_pages() returns |
| | -EFAULT. |
| | - Since BTRFS_INODE_DIO_READY is |
| | set, we don't release reserved |
| | space. |
| | - Clear BTRFS_INODE_DIO_READY bit.|
| -EIOCBQUEUED is returned. | |
|-----------------------------------+-------------------------------------|
Hence this commit introduces "struct btrfs_dio_data" to track the usage of
reserved data space. The remaining unused "reserve space" can now be freed
reliably.
Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 2 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 42 |
2 files changed, 21 insertions, 23 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 81220b2203c6..0ef5cc13fae2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h | |||
@@ -44,8 +44,6 @@ | |||
44 | #define BTRFS_INODE_IN_DELALLOC_LIST 9 | 44 | #define BTRFS_INODE_IN_DELALLOC_LIST 9 |
45 | #define BTRFS_INODE_READDIO_NEED_LOCK 10 | 45 | #define BTRFS_INODE_READDIO_NEED_LOCK 10 |
46 | #define BTRFS_INODE_HAS_PROPS 11 | 46 | #define BTRFS_INODE_HAS_PROPS 11 |
47 | /* DIO is ready to submit */ | ||
48 | #define BTRFS_INODE_DIO_READY 12 | ||
49 | /* | 47 | /* |
50 | * The following 3 bits are meant only for the btree inode. | 48 | * The following 3 bits are meant only for the btree inode. |
51 | * When any of them is set, it means an error happened while writing an | 49 | * When any of them is set, it means an error happened while writing an |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d983dea57af..b7e439bf5e4f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -7405,6 +7405,10 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, | |||
7405 | return em; | 7405 | return em; |
7406 | } | 7406 | } |
7407 | 7407 | ||
7408 | struct btrfs_dio_data { | ||
7409 | u64 outstanding_extents; | ||
7410 | u64 reserve; | ||
7411 | }; | ||
7408 | 7412 | ||
7409 | static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | 7413 | static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, |
7410 | struct buffer_head *bh_result, int create) | 7414 | struct buffer_head *bh_result, int create) |
@@ -7412,10 +7416,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
7412 | struct extent_map *em; | 7416 | struct extent_map *em; |
7413 | struct btrfs_root *root = BTRFS_I(inode)->root; | 7417 | struct btrfs_root *root = BTRFS_I(inode)->root; |
7414 | struct extent_state *cached_state = NULL; | 7418 | struct extent_state *cached_state = NULL; |
7419 | struct btrfs_dio_data *dio_data = NULL; | ||
7415 | u64 start = iblock << inode->i_blkbits; | 7420 | u64 start = iblock << inode->i_blkbits; |
7416 | u64 lockstart, lockend; | 7421 | u64 lockstart, lockend; |
7417 | u64 len = bh_result->b_size; | 7422 | u64 len = bh_result->b_size; |
7418 | u64 *outstanding_extents = NULL; | ||
7419 | int unlock_bits = EXTENT_LOCKED; | 7423 | int unlock_bits = EXTENT_LOCKED; |
7420 | int ret = 0; | 7424 | int ret = 0; |
7421 | 7425 | ||
@@ -7433,7 +7437,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
7433 | * that anything that needs to check if there's a transction doesn't get | 7437 | * that anything that needs to check if there's a transction doesn't get |
7434 | * confused. | 7438 | * confused. |
7435 | */ | 7439 | */ |
7436 | outstanding_extents = current->journal_info; | 7440 | dio_data = current->journal_info; |
7437 | current->journal_info = NULL; | 7441 | current->journal_info = NULL; |
7438 | } | 7442 | } |
7439 | 7443 | ||
@@ -7565,17 +7569,18 @@ unlock: | |||
7565 | * within our reservation, otherwise we need to adjust our inode | 7569 | * within our reservation, otherwise we need to adjust our inode |
7566 | * counter appropriately. | 7570 | * counter appropriately. |
7567 | */ | 7571 | */ |
7568 | if (*outstanding_extents) { | 7572 | if (dio_data->outstanding_extents) { |
7569 | (*outstanding_extents)--; | 7573 | (dio_data->outstanding_extents)--; |
7570 | } else { | 7574 | } else { |
7571 | spin_lock(&BTRFS_I(inode)->lock); | 7575 | spin_lock(&BTRFS_I(inode)->lock); |
7572 | BTRFS_I(inode)->outstanding_extents++; | 7576 | BTRFS_I(inode)->outstanding_extents++; |
7573 | spin_unlock(&BTRFS_I(inode)->lock); | 7577 | spin_unlock(&BTRFS_I(inode)->lock); |
7574 | } | 7578 | } |
7575 | 7579 | ||
7576 | current->journal_info = outstanding_extents; | ||
7577 | btrfs_free_reserved_data_space(inode, len); | 7580 | btrfs_free_reserved_data_space(inode, len); |
7578 | set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); | 7581 | WARN_ON(dio_data->reserve < len); |
7582 | dio_data->reserve -= len; | ||
7583 | current->journal_info = dio_data; | ||
7579 | } | 7584 | } |
7580 | 7585 | ||
7581 | /* | 7586 | /* |
@@ -7598,8 +7603,8 @@ unlock: | |||
7598 | unlock_err: | 7603 | unlock_err: |
7599 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, | 7604 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, |
7600 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); | 7605 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); |
7601 | if (outstanding_extents) | 7606 | if (dio_data) |
7602 | current->journal_info = outstanding_extents; | 7607 | current->journal_info = dio_data; |
7603 | return ret; | 7608 | return ret; |
7604 | } | 7609 | } |
7605 | 7610 | ||
@@ -8329,7 +8334,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
8329 | { | 8334 | { |
8330 | struct file *file = iocb->ki_filp; | 8335 | struct file *file = iocb->ki_filp; |
8331 | struct inode *inode = file->f_mapping->host; | 8336 | struct inode *inode = file->f_mapping->host; |
8332 | u64 outstanding_extents = 0; | 8337 | struct btrfs_root *root = BTRFS_I(inode)->root; |
8338 | struct btrfs_dio_data dio_data = { 0 }; | ||
8333 | size_t count = 0; | 8339 | size_t count = 0; |
8334 | int flags = 0; | 8340 | int flags = 0; |
8335 | bool wakeup = true; | 8341 | bool wakeup = true; |
@@ -8367,7 +8373,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
8367 | ret = btrfs_delalloc_reserve_space(inode, count); | 8373 | ret = btrfs_delalloc_reserve_space(inode, count); |
8368 | if (ret) | 8374 | if (ret) |
8369 | goto out; | 8375 | goto out; |
8370 | outstanding_extents = div64_u64(count + | 8376 | dio_data.outstanding_extents = div64_u64(count + |
8371 | BTRFS_MAX_EXTENT_SIZE - 1, | 8377 | BTRFS_MAX_EXTENT_SIZE - 1, |
8372 | BTRFS_MAX_EXTENT_SIZE); | 8378 | BTRFS_MAX_EXTENT_SIZE); |
8373 | 8379 | ||
@@ -8376,7 +8382,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
8376 | * do the accounting properly if we go over the number we | 8382 | * do the accounting properly if we go over the number we |
8377 | * originally calculated. Abuse current->journal_info for this. | 8383 | * originally calculated. Abuse current->journal_info for this. |
8378 | */ | 8384 | */ |
8379 | current->journal_info = &outstanding_extents; | 8385 | dio_data.reserve = round_up(count, root->sectorsize); |
8386 | current->journal_info = &dio_data; | ||
8380 | } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, | 8387 | } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, |
8381 | &BTRFS_I(inode)->runtime_flags)) { | 8388 | &BTRFS_I(inode)->runtime_flags)) { |
8382 | inode_dio_end(inode); | 8389 | inode_dio_end(inode); |
@@ -8391,16 +8398,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
8391 | if (iov_iter_rw(iter) == WRITE) { | 8398 | if (iov_iter_rw(iter) == WRITE) { |
8392 | current->journal_info = NULL; | 8399 | current->journal_info = NULL; |
8393 | if (ret < 0 && ret != -EIOCBQUEUED) { | 8400 | if (ret < 0 && ret != -EIOCBQUEUED) { |
8394 | /* | 8401 | if (dio_data.reserve) |
8395 | * If the error comes from submitting stage, | 8402 | btrfs_delalloc_release_space(inode, |
8396 | * btrfs_get_blocsk_direct() has free'd data space, | 8403 | dio_data.reserve); |
8397 | * and metadata space will be handled by | ||
8398 | * finish_ordered_fn, don't do that again to make | ||
8399 | * sure bytes_may_use is correct. | ||
8400 | */ | ||
8401 | if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, | ||
8402 | &BTRFS_I(inode)->runtime_flags)) | ||
8403 | btrfs_delalloc_release_space(inode, count); | ||
8404 | } else if (ret >= 0 && (size_t)ret < count) | 8404 | } else if (ret >= 0 && (size_t)ret < count) |
8405 | btrfs_delalloc_release_space(inode, | 8405 | btrfs_delalloc_release_space(inode, |
8406 | count - (size_t)ret); | 8406 | count - (size_t)ret); |