author		Chris Mason <clm@fb.com>	2015-12-23 16:28:35 -0500
committer	Chris Mason <clm@fb.com>	2015-12-23 16:28:35 -0500
commit		a53fe2576955171449711933242d8fb1c13a7d5c (patch)
tree		6fd4a583f2e9606c6c61226b40a060e4c1b2dc60
parent		bb9d687618695e8291f1e6209eb3211d231f97bb (diff)
parent		e44081ef611832b47a86abf4e36dc0ed2e950884 (diff)
Merge branch 'for-chris-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux into for-linus-4.5
-rw-r--r--	fs/btrfs/extent-tree.c	19
-rw-r--r--	fs/btrfs/inode.c	127
-rw-r--r--	fs/btrfs/transaction.c	17
-rw-r--r--	fs/btrfs/tree-defrag.c	27
-rw-r--r--	fs/btrfs/volumes.c	17
5 files changed, 151 insertions(+), 56 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 065055342881..d436117e20dd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3684,11 +3684,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	/*
-	 * We don't need the lock here since we are protected by the transaction
-	 * commit. We want to do the cache_save_setup first and then run the
+	 * Even though we are in the critical section of the transaction commit,
+	 * we can still have concurrent tasks adding elements to this
+	 * transaction's list of dirty block groups. These tasks correspond to
+	 * endio free space workers started when writeback finishes for a
+	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+	 * allocate new block groups as a result of COWing nodes of the root
+	 * tree when updating the free space inode. The writeback for the space
+	 * caches is triggered by an earlier call to
+	 * btrfs_start_dirty_block_groups() and iterations of the following
+	 * loop.
+	 * Also we want to do the cache_save_setup first and then run the
 	 * delayed refs to make sure we have the best chance at doing this all
 	 * in one shot.
 	 */
+	spin_lock(&cur_trans->dirty_bgs_lock);
 	while (!list_empty(&cur_trans->dirty_bgs)) {
 		cache = list_first_entry(&cur_trans->dirty_bgs,
 					 struct btrfs_block_group_cache,
@@ -3700,11 +3710,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * finish and then do it all again
 		 */
 		if (!list_empty(&cache->io_list)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
 			list_del_init(&cache->io_list);
 			btrfs_wait_cache_io(root, trans, cache,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
+			spin_lock(&cur_trans->dirty_bgs_lock);
 		}
 
3710 /* 3722 /*
@@ -3712,6 +3724,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * on any pending IO
 		 */
 		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
 		should_put = 1;
 
 		cache_save_setup(cache, trans, path);
@@ -3743,7 +3756,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		/* if its not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
+		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
 
 	while (!list_empty(io)) {
 		cache = list_first_entry(io, struct btrfs_block_group_cache,
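
Every extent-tree.c hunk above enforces the same rule: cur_trans->dirty_bgs_lock is held whenever the dirty_bgs list is tested or modified, and dropped around calls that may block, such as btrfs_wait_cache_io(). Below is a minimal userspace sketch of that drop-and-retake idiom (a pthread mutex standing in for the kernel spinlock; all names are illustrative, this is not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int id;
};

static struct node *dirty_list;	/* concurrently appended to by producers */
static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stands in for the blocking write-out done per block group. */
static void write_out(struct node *n)
{
	printf("writing block group %d\n", n->id);
}

static void write_dirty_list(void)
{
	pthread_mutex_lock(&dirty_lock);
	while (dirty_list) {
		struct node *n = dirty_list;

		dirty_list = n->next;			/* like list_del_init() */
		pthread_mutex_unlock(&dirty_lock);	/* drop before blocking */
		write_out(n);
		free(n);
		pthread_mutex_lock(&dirty_lock);	/* retake, then re-test */
	}
	pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->id = i;
		n->next = dirty_list;
		dirty_list = n;
	}
	write_dirty_list();
	return 0;
}

Because the lock is retaken before the loop condition is re-evaluated, entries appended by concurrent endio workers while the lock was dropped are picked up on a later iteration rather than lost.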
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dc616329c9a1..bdb0008712c8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
 	struct btrfs_root *root;
 };
 
+struct btrfs_dio_data {
+	u64 outstanding_extents;
+	u64 reserve;
+	u64 unsubmitted_oe_range_start;
+	u64 unsubmitted_oe_range_end;
+};
+
 static const struct inode_operations btrfs_dir_inode_operations;
 static const struct inode_operations btrfs_symlink_inode_operations;
 static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -7408,25 +7415,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
 		} else {
-			/* Screw you mmap */
-			ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
-			if (ret)
-				break;
-			ret = filemap_fdatawait_range(inode->i_mapping,
-						      lockstart,
-						      lockend);
-			if (ret)
-				break;
-
 			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
+			 * We could trigger writeback for this range (and wait
+			 * for it to complete) and then invalidate the pages for
+			 * this range (through invalidate_inode_pages2_range()),
+			 * but that can lead us to a deadlock with a concurrent
+			 * call to readpages() (a buffered read or a defrag call
+			 * triggered a readahead) on a page lock due to an
+			 * ordered dio extent we created before but did not have
+			 * yet a corresponding bio submitted (whence it can not
+			 * complete), which makes readpages() wait for that
+			 * ordered extent to complete while holding a lock on
+			 * that page.
 			 */
-			ret = invalidate_inode_pages2_range(inode->i_mapping,
-					lockstart >> PAGE_CACHE_SHIFT,
-					lockend >> PAGE_CACHE_SHIFT);
-			if (ret)
-				break;
+			ret = -ENOTBLK;
+			break;
 		}
 
 		cond_resched();
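
With the writeback-plus-invalidate attempt removed, encountering cached pages in the target range now simply gives up on direct IO with -ENOTBLK so the write can proceed through the page cache. A toy model of that fallback contract, assuming hypothetical helper names for the two btrfs write paths:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins: the real paths are btrfs' direct IO and
 * buffered write code; only the -ENOTBLK contract is modeled here. */
static int try_direct_write(void)
{
	return -ENOTBLK;	/* page cache pages were in the way */
}

static int buffered_write(void)
{
	return 0;
}

int main(void)
{
	int ret = try_direct_write();

	/* -ENOTBLK means "retry via the page cache", not "fail the write" */
	if (ret == -ENOTBLK)
		ret = buffered_write();
	printf("ret = %d\n", ret);
	return ret;
}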
@@ -7482,11 +7485,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 	return em;
 }
 
-struct btrfs_dio_data {
-	u64 outstanding_extents;
-	u64 reserve;
-};
-
 static void adjust_dio_outstanding_extents(struct inode *inode,
 					   struct btrfs_dio_data *dio_data,
 					   const u64 len)
@@ -7670,6 +7668,7 @@ unlock:
 		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
+		dio_data->unsubmitted_oe_range_end = start + len;
 		current->journal_info = dio_data;
 	}
 
@@ -7992,22 +7991,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
 	bio_put(bio);
 }
 
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+						    const u64 offset,
+						    const u64 bytes,
+						    const int uptodate)
 {
-	struct btrfs_dio_private *dip = bio->bi_private;
-	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_extent *ordered = NULL;
-	u64 ordered_offset = dip->logical_offset;
-	u64 ordered_bytes = dip->bytes;
-	struct bio *dio_bio;
+	u64 ordered_offset = offset;
+	u64 ordered_bytes = bytes;
 	int ret;
 
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
 						   ordered_bytes,
-						   !bio->bi_error);
+						   uptodate);
 	if (!ret)
 		goto out_test;
 
@@ -8020,13 +8019,22 @@ out_test:
 	 * our bio might span multiple ordered extents. If we haven't
 	 * completed the accounting for the whole dio, go back and try again
 	 */
-	if (ordered_offset < dip->logical_offset + dip->bytes) {
-		ordered_bytes = dip->logical_offset + dip->bytes -
-			ordered_offset;
+	if (ordered_offset < offset + bytes) {
+		ordered_bytes = offset + bytes - ordered_offset;
 		ordered = NULL;
 		goto again;
 	}
-	dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct bio *dio_bio = dip->dio_bio;
+
+	btrfs_endio_direct_write_update_ordered(dip->inode,
+						dip->logical_offset,
+						dip->bytes,
+						!bio->bi_error);
 
 	kfree(dip);
 
@@ -8334,6 +8342,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 		dip->subio_endio = btrfs_subio_endio_read;
 	}
 
+	/*
+	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
+	 * even if we fail to submit a bio, because in such case we do the
+	 * corresponding error handling below and it must not be done a second
+	 * time by btrfs_direct_IO().
+	 */
+	if (write) {
+		struct btrfs_dio_data *dio_data = current->journal_info;
+
+		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+			dip->bytes;
+		dio_data->unsubmitted_oe_range_start =
+			dio_data->unsubmitted_oe_range_end;
+	}
+
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;
@@ -8362,24 +8385,15 @@ free_ordered:
 		dip = NULL;
 		io_bio = NULL;
 	} else {
-		if (write) {
-			struct btrfs_ordered_extent *ordered;
-
-			ordered = btrfs_lookup_ordered_extent(inode,
-							      file_offset);
-			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-			/*
-			 * Decrements our ref on the ordered extent and removes
-			 * the ordered extent from the inode's ordered tree,
-			 * doing all the proper resource cleanup such as for the
-			 * reserved space and waking up any waiters for this
-			 * ordered extent (through btrfs_remove_ordered_extent).
-			 */
-			btrfs_finish_ordered_io(ordered);
-		} else {
+		if (write)
+			btrfs_endio_direct_write_update_ordered(inode,
+							file_offset,
+							dio_bio->bi_iter.bi_size,
+							0);
+		else
 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
 				      file_offset + dio_bio->bi_iter.bi_size - 1);
-		}
+
 		dio_bio->bi_error = -EIO;
 		/*
 		 * Releases and cleans up our dio_bio, no need to bio_put()
@@ -8479,6 +8493,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		 * originally calculated. Abuse current->journal_info for this.
 		 */
 		dio_data.reserve = round_up(count, root->sectorsize);
+		dio_data.unsubmitted_oe_range_start = (u64)offset;
+		dio_data.unsubmitted_oe_range_end = (u64)offset;
 		current->journal_info = &dio_data;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 			    &BTRFS_I(inode)->runtime_flags)) {
@@ -8497,6 +8513,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			if (dio_data.reserve)
 				btrfs_delalloc_release_space(inode, offset,
 							     dio_data.reserve);
+			/*
+			 * On error we might have left some ordered extents
+			 * without submitting corresponding bios for them, so
+			 * clean them up to avoid other tasks getting them
+			 * and waiting for them to complete forever.
+			 */
+			if (dio_data.unsubmitted_oe_range_start <
+			    dio_data.unsubmitted_oe_range_end)
+				btrfs_endio_direct_write_update_ordered(inode,
+					dio_data.unsubmitted_oe_range_start,
+					dio_data.unsubmitted_oe_range_end -
+						dio_data.unsubmitted_oe_range_start,
+					0);
 		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode, offset,
 						     count - (size_t)ret);
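
Taken together, the inode.c hunks maintain a half-open range [unsubmitted_oe_range_start, unsubmitted_oe_range_end) of file offsets covered by ordered extents that were created but never got a bio: it starts empty at offset (the btrfs_direct_IO() hunk above), grows when btrfs_get_blocks_direct() creates an ordered extent (the unlock: hunk), is reset once btrfs_submit_direct() takes over error handling, and anything still in it when the dio fails is completed with uptodate == 0. A userspace sketch of just this bookkeeping (field names mirror the patch, the logic is simplified):

#include <stdint.h>
#include <stdio.h>

struct dio_data {
	uint64_t unsubmitted_oe_range_start;
	uint64_t unsubmitted_oe_range_end;
};

/* Stands in for btrfs_endio_direct_write_update_ordered(..., 0). */
static void fail_ordered_range(uint64_t offset, uint64_t bytes)
{
	printf("failing ordered extents in [%llu, %llu)\n",
	       (unsigned long long)offset,
	       (unsigned long long)(offset + bytes));
}

int main(void)
{
	/* dio starts with an empty range at the write offset */
	struct dio_data d = { .unsubmitted_oe_range_start = 4096,
			      .unsubmitted_oe_range_end = 4096 };

	/* get_block time: an ordered extent was created for 8 KiB ... */
	d.unsubmitted_oe_range_end += 8192;

	/* ... and the dio failed before any bio was submitted for it */
	if (d.unsubmitted_oe_range_start < d.unsubmitted_oe_range_end)
		fail_ordered_range(d.unsubmitted_oe_range_start,
				   d.unsubmitted_oe_range_end -
				   d.unsubmitted_oe_range_start);
	return 0;
}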
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index be8eae80ff65..f85ccf634ca1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			list_del_init(&em->list);
 			free_extent_map(em);
 		}
+		/*
+		 * If any block groups are found in ->deleted_bgs then it's
+		 * because the transaction was aborted and a commit did not
+		 * happen (things failed before writing the new superblock
+		 * and calling btrfs_finish_extent_commit()), so we can not
+		 * discard the physical locations of the block groups.
+		 */
+		while (!list_empty(&transaction->deleted_bgs)) {
+			struct btrfs_block_group_cache *cache;
+
+			cache = list_first_entry(&transaction->deleted_bgs,
+						 struct btrfs_block_group_cache,
+						 bg_list);
+			list_del_init(&cache->bg_list);
+			btrfs_put_block_group_trimming(cache);
+			btrfs_put_block_group(cache);
+		}
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }
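
This drain only ever sees entries on the abort path; after a successful commit the list is emptied by btrfs_finish_extent_commit(), where discarding the freed space is safe. The loop reduces to a reference-counting pattern, sketched below in userspace C (two references per entry, both dropped while unlinking; types and counts are illustrative, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct bg {
	struct bg *next;
	int refs;
};

static void bg_put(struct bg *b)
{
	if (--b->refs == 0)
		free(b);
}

static void drain_deleted_bgs(struct bg **head)
{
	while (*head) {
		struct bg *b = *head;

		*head = b->next;	/* like list_del_init(&cache->bg_list) */
		bg_put(b);		/* like btrfs_put_block_group_trimming() */
		bg_put(b);		/* like btrfs_put_block_group() */
	}
}

int main(void)
{
	struct bg *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct bg *b = malloc(sizeof(*b));

		b->refs = 2;	/* one ref for the list, one for trimming */
		b->next = head;
		head = b;
	}
	drain_deleted_bgs(&head);
	printf("deleted_bgs drained\n");
	return 0;
}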
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f31db4325339..cb65089127cc 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	btrfs_release_path(path);
+	/*
+	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
+	 * deadlock later (attempting to write lock an already write locked
+	 * leaf).
+	 */
+	path->lowest_level = 1;
 	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
 	if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		ret = 0;
 		goto out;
 	}
-	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
-					   min_trans);
+	/*
+	 * The node at level 1 must always be locked when our path has
+	 * keep_locks set and lowest_level is 1, regardless of the value of
+	 * path->slots[1].
+	 */
+	BUG_ON(path->locks[1] == 0);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
 				 &last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		WARN_ON(ret == -EAGAIN);
 		goto out;
 	}
+	/*
+	 * Now that we reallocated the node we can find the next key. Note that
+	 * btrfs_find_next_key() can release our path and do another search
+	 * without COWing, this is because even with path->keep_locks = 1,
+	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
+	 * node when path->slots[node_level - 1] does not point to the last
+	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+	 * we search for the next key after reallocating our node.
+	 */
+	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+					   min_trans);
 	if (next_key_ret == 0) {
 		memcpy(&root->defrag_progress, &key, sizeof(key));
 		ret = -EAGAIN;
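
The net effect of the three tree-defrag.c hunks is an ordering constraint: btrfs_realloc_node() needs the level-1 lock that the search took, while btrfs_find_next_key() may release the path, so the next-key lookup must happen only after the reallocation. A toy model of that constraint (illustrative, not kernel code):

#include <assert.h>
#include <stdio.h>

static int node_locked = 1;	/* taken by the earlier search, like locks[1] */

static void realloc_node(void)
{
	assert(node_locked);	/* mirrors BUG_ON(path->locks[1] == 0) */
	printf("node reallocated under lock\n");
}

static int find_next_key(void)
{
	node_locked = 0;	/* may release the path, like unlock_up() */
	return 123;		/* next key to resume defrag from */
}

int main(void)
{
	realloc_node();			/* must come first ...              */
	int next = find_next_key();	/* ... because this can drop locks  */

	printf("resume at key %d\n", next);
	return 0;
}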
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 54d2d2cc2c92..a37cc0478bb2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4825,20 +4825,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * Take the device list mutex to prevent races with the final phase of
+	 * a device replace operation that replaces the device object associated
+	 * with the map's stripes, because the device object's id can change
+	 * at any time during that final phase of the device replace operation
+	 * (dev-replace.c:btrfs_dev_replace_finishing()).
+	 */
+	mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
 		device = map->stripes[i].dev;
 		dev_offset = map->stripes[i].physical;
 
 		ret = btrfs_update_device(trans, device);
 		if (ret)
-			goto out;
+			break;
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     chunk_root->root_key.objectid,
 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 					     chunk_offset, dev_offset,
 					     stripe_size);
 		if (ret)
-			goto out;
+			break;
+	}
+	if (ret) {
+		mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+		goto out;
 	}
 
 	stripe = &chunk->stripe;
@@ -4851,6 +4863,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
 	}
+	mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
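
Because the loop now runs with device_list_mutex held, the two error exits become break instead of goto out, funneling every failure through a single unlock before leaving. The control-flow shape, reduced to runnable userspace C with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dev_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stands in for btrfs_update_device()/btrfs_alloc_dev_extent(). */
static int update_one(int i)
{
	return i == 2 ? -5 /* -EIO */ : 0;
}

static int finish_chunk_alloc(void)
{
	int ret = 0;

	pthread_mutex_lock(&dev_list_lock);
	for (int i = 0; i < 4; i++) {
		ret = update_one(i);
		if (ret)
			break;	/* with the lock held, a direct goto would leak it */
	}
	if (ret) {
		pthread_mutex_unlock(&dev_list_lock);
		goto out;
	}
	pthread_mutex_unlock(&dev_list_lock);
out:
	return ret;
}

int main(void)
{
	printf("ret = %d\n", finish_chunk_alloc());
	return 0;
}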