diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-05-16 18:50:58 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-05-16 18:50:58 -0400 |
commit | c7309e88a694acbe9e42655f02b9dd37c7931424 (patch) | |
tree | d4d6e55e33bf34f7759c8be1cc52b938aff68813 /fs/btrfs | |
parent | 518af3cb8ccaf32057db6046e241ec393d6c7b98 (diff) | |
parent | 062c19e9dd692b8a78e3532f71c290520a2ab437 (diff) |
Merge branch 'for-linus-4.1' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs fixes from Chris Mason:
"The first commit is a fix from Filipe for a very old extent buffer
reuse race that triggered a BUG_ON. It hasn't come up often; I looked
through old logs at FB and we hit it a handful of times over the last
year.
The rest are other corners he hit during testing"
* 'for-linus-4.1' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
Btrfs: fix race when reusing stale extent buffers that leads to BUG_ON
Btrfs: fix race between block group creation and their cache writeout
Btrfs: fix panic when starting bg cache writeout after IO error
Btrfs: fix crash after inode cache writeback failure
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/extent-tree.c | 31 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 19 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 14 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.c | 14 |
4 files changed, 68 insertions, 10 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ec8e228b89f..7effed6f2fa6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -3180,8 +3180,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, | |||
3180 | btrfs_mark_buffer_dirty(leaf); | 3180 | btrfs_mark_buffer_dirty(leaf); |
3181 | fail: | 3181 | fail: |
3182 | btrfs_release_path(path); | 3182 | btrfs_release_path(path); |
3183 | if (ret) | ||
3184 | btrfs_abort_transaction(trans, root, ret); | ||
3185 | return ret; | 3183 | return ret; |
3186 | 3184 | ||
3187 | } | 3185 | } |
@@ -3487,8 +3485,30 @@ again: | |||
3487 | ret = 0; | 3485 | ret = 0; |
3488 | } | 3486 | } |
3489 | } | 3487 | } |
3490 | if (!ret) | 3488 | if (!ret) { |
3491 | ret = write_one_cache_group(trans, root, path, cache); | 3489 | ret = write_one_cache_group(trans, root, path, cache); |
3490 | /* | ||
3491 | * Our block group might still be attached to the list | ||
3492 | * of new block groups in the transaction handle of some | ||
3493 | * other task (struct btrfs_trans_handle->new_bgs). This | ||
3494 | * means its block group item isn't yet in the extent | ||
3495 | * tree. If this happens ignore the error, as we will | ||
3496 | * try again later in the critical section of the | ||
3497 | * transaction commit. | ||
3498 | */ | ||
3499 | if (ret == -ENOENT) { | ||
3500 | ret = 0; | ||
3501 | spin_lock(&cur_trans->dirty_bgs_lock); | ||
3502 | if (list_empty(&cache->dirty_list)) { | ||
3503 | list_add_tail(&cache->dirty_list, | ||
3504 | &cur_trans->dirty_bgs); | ||
3505 | btrfs_get_block_group(cache); | ||
3506 | } | ||
3507 | spin_unlock(&cur_trans->dirty_bgs_lock); | ||
3508 | } else if (ret) { | ||
3509 | btrfs_abort_transaction(trans, root, ret); | ||
3510 | } | ||
3511 | } | ||
3492 | 3512 | ||
3493 | /* if its not on the io list, we need to put the block group */ | 3513 | /* if its not on the io list, we need to put the block group */ |
3494 | if (should_put) | 3514 | if (should_put) |
@@ -3597,8 +3617,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | |||
3597 | ret = 0; | 3617 | ret = 0; |
3598 | } | 3618 | } |
3599 | } | 3619 | } |
3600 | if (!ret) | 3620 | if (!ret) { |
3601 | ret = write_one_cache_group(trans, root, path, cache); | 3621 | ret = write_one_cache_group(trans, root, path, cache); |
3622 | if (ret) | ||
3623 | btrfs_abort_transaction(trans, root, ret); | ||
3624 | } | ||
3602 | 3625 | ||
3603 | /* if its not on the io list, we need to put the block group */ | 3626 | /* if its not on the io list, we need to put the block group */ |
3604 | if (should_put) | 3627 | if (should_put) |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43af5a61ad25..c32d226bfecc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -4772,6 +4772,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, | |||
4772 | start >> PAGE_CACHE_SHIFT); | 4772 | start >> PAGE_CACHE_SHIFT); |
4773 | if (eb && atomic_inc_not_zero(&eb->refs)) { | 4773 | if (eb && atomic_inc_not_zero(&eb->refs)) { |
4774 | rcu_read_unlock(); | 4774 | rcu_read_unlock(); |
4775 | /* | ||
4776 | * Lock our eb's refs_lock to avoid races with | ||
4777 | * free_extent_buffer. When we get our eb it might be flagged | ||
4778 | * with EXTENT_BUFFER_STALE and another task running | ||
4779 | * free_extent_buffer might have seen that flag set, | ||
4780 | * eb->refs == 2, that the buffer isn't under IO (dirty and | ||
4781 | * writeback flags not set) and it's still in the tree (flag | ||
4782 | * EXTENT_BUFFER_TREE_REF set), therefore being in the process | ||
4783 | * of decrementing the extent buffer's reference count twice. | ||
4784 | * So here we could race and increment the eb's reference count, | ||
4785 | * clear its stale flag, mark it as dirty and drop our reference | ||
4786 | * before the other task finishes executing free_extent_buffer, | ||
4787 | * which would later result in an attempt to free an extent | ||
4788 | * buffer that is dirty. | ||
4789 | */ | ||
4790 | if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { | ||
4791 | spin_lock(&eb->refs_lock); | ||
4792 | spin_unlock(&eb->refs_lock); | ||
4793 | } | ||
4775 | mark_extent_buffer_accessed(eb, NULL); | 4794 | mark_extent_buffer_accessed(eb, NULL); |
4776 | return eb; | 4795 | return eb; |
4777 | } | 4796 | } |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5e020d76fd07..9dbe5b548fa6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -3466,6 +3466,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, | |||
3466 | struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; | 3466 | struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; |
3467 | int ret; | 3467 | int ret; |
3468 | struct btrfs_io_ctl io_ctl; | 3468 | struct btrfs_io_ctl io_ctl; |
3469 | bool release_metadata = true; | ||
3469 | 3470 | ||
3470 | if (!btrfs_test_opt(root, INODE_MAP_CACHE)) | 3471 | if (!btrfs_test_opt(root, INODE_MAP_CACHE)) |
3471 | return 0; | 3472 | return 0; |
@@ -3473,11 +3474,20 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, | |||
3473 | memset(&io_ctl, 0, sizeof(io_ctl)); | 3474 | memset(&io_ctl, 0, sizeof(io_ctl)); |
3474 | ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, | 3475 | ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, |
3475 | trans, path, 0); | 3476 | trans, path, 0); |
3476 | if (!ret) | 3477 | if (!ret) { |
3478 | /* | ||
3479 | * At this point writepages() didn't error out, so our metadata | ||
3480 | * reservation is released when the writeback finishes, at | ||
3481 | * inode.c:btrfs_finish_ordered_io(), regardless of it finishing | ||
3482 | * with or without an error. | ||
3483 | */ | ||
3484 | release_metadata = false; | ||
3477 | ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); | 3485 | ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); |
3486 | } | ||
3478 | 3487 | ||
3479 | if (ret) { | 3488 | if (ret) { |
3480 | btrfs_delalloc_release_metadata(inode, inode->i_size); | 3489 | if (release_metadata) |
3490 | btrfs_delalloc_release_metadata(inode, inode->i_size); | ||
3481 | #ifdef DEBUG | 3491 | #ifdef DEBUG |
3482 | btrfs_err(root->fs_info, | 3492 | btrfs_err(root->fs_info, |
3483 | "failed to write free ino cache for root %llu", | 3493 | "failed to write free ino cache for root %llu", |
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 157cc54fc634..760c4a5e096b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
@@ -722,6 +722,7 @@ void btrfs_start_ordered_extent(struct inode *inode, | |||
722 | int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | 722 | int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) |
723 | { | 723 | { |
724 | int ret = 0; | 724 | int ret = 0; |
725 | int ret_wb = 0; | ||
725 | u64 end; | 726 | u64 end; |
726 | u64 orig_end; | 727 | u64 orig_end; |
727 | struct btrfs_ordered_extent *ordered; | 728 | struct btrfs_ordered_extent *ordered; |
@@ -741,9 +742,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | |||
741 | if (ret) | 742 | if (ret) |
742 | return ret; | 743 | return ret; |
743 | 744 | ||
744 | ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); | 745 | /* |
745 | if (ret) | 746 | * If we have a writeback error don't return immediately. Wait first |
746 | return ret; | 747 | * for any ordered extents that haven't completed yet. This is to make |
748 | * sure no one can dirty the same page ranges and call writepages() | ||
749 | * before the ordered extents complete - to avoid failures (-EEXIST) | ||
750 | * when adding the new ordered extents to the ordered tree. | ||
751 | */ | ||
752 | ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); | ||
747 | 753 | ||
748 | end = orig_end; | 754 | end = orig_end; |
749 | while (1) { | 755 | while (1) { |
@@ -767,7 +773,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | |||
767 | break; | 773 | break; |
768 | end--; | 774 | end--; |
769 | } | 775 | } |
770 | return ret; | 776 | return ret_wb ? ret_wb : ret; |
771 | } | 777 | } |
772 | 778 | ||
773 | /* | 779 | /* |