aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author: Filipe Manana <fdmanana@suse.com> 2015-05-20 09:01:54 -0400
committer: Chris Mason <clm@fb.com> 2015-06-03 07:03:04 -0400
commit: 4fbcdf6694544fd9d2aedbc1e73e52b90a4fcc20 (patch)
tree: 2d5979c0e29cc6f21a04fa521b261bfb75b04168 /fs
parent: 0d2b2372e097cd3b4150d3ec91e79ac3c5cc750e (diff)
Btrfs: fix -ENOSPC when finishing block group creation
While creating a block group, we often end up getting ENOSPC while updating the chunk tree, which leads to a transaction abort that produces a trace like the following:

[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323] [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862] [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116] [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593] [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960] [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649] [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092] [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218] [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622] [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642] [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692] [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---

This is because we don't do proper space reservation for the chunk block reserve when we have multiple tasks allocating chunks in parallel.

Block group creation has two phases: the first phase essentially checks if there is enough space in the system space_info, allocating a new system chunk if there isn't, while the second phase updates the device, extent and chunk trees.
However, because the updates to the chunk tree happen in the second phase, if we have N tasks, each with its own transaction handle, allocating new chunks in parallel, and if there is only enough space in the system space_info to allocate M chunks, where M < N, none of the tasks end up allocating a new system chunk in the first phase and N - M tasks will get -ENOSPC when attempting to update the chunk tree in phase 2 if they need to COW any nodes/leaves from the chunk tree.

Fix this by doing proper reservation in the chunk block reserve.

The issue could be reproduced by running fstests generic/038 in a loop, which eventually triggered the problem.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/ctree.h 1
-rw-r--r-- fs/btrfs/extent-tree.c 44
-rw-r--r-- fs/btrfs/transaction.c 6
-rw-r--r-- fs/btrfs/transaction.h 1
4 files changed, 50 insertions, 2 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9d7a6c38f0b1..8ee5645ef9e1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3458,6 +3458,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
3458void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3458void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
3459void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3459void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3460 struct btrfs_root *root); 3460 struct btrfs_root *root);
3461void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
3461int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 3462int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3462 struct inode *inode); 3463 struct inode *inode);
3463void btrfs_orphan_release_metadata(struct inode *inode); 3464void btrfs_orphan_release_metadata(struct inode *inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1cbc71d8cb96..4e08e47ace30 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4116,11 +4116,19 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
4116 struct btrfs_space_info *info; 4116 struct btrfs_space_info *info;
4117 u64 left; 4117 u64 left;
4118 u64 thresh; 4118 u64 thresh;
4119 int ret = 0;
4120
4121 /*
4122 * Needed because we can end up allocating a system chunk and for an
4123 * atomic and race free space reservation in the chunk block reserve.
4124 */
4125 ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4119 4126
4120 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4127 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4121 spin_lock(&info->lock); 4128 spin_lock(&info->lock);
4122 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 4129 left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4123 info->bytes_reserved - info->bytes_readonly; 4130 info->bytes_reserved - info->bytes_readonly -
4131 info->bytes_may_use;
4124 spin_unlock(&info->lock); 4132 spin_unlock(&info->lock);
4125 4133
4126 thresh = get_system_chunk_thresh(root, type); 4134 thresh = get_system_chunk_thresh(root, type);
@@ -4134,7 +4142,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
4134 u64 flags; 4142 u64 flags;
4135 4143
4136 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 4144 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4137 btrfs_alloc_chunk(trans, root, flags); 4145 /*
4146 * Ignore failure to create system chunk. We might end up not
4147 * needing it, as we might not need to COW all nodes/leafs from
4148 * the paths we visit in the chunk tree (they were already COWed
4149 * or created in the current transaction for example).
4150 */
4151 ret = btrfs_alloc_chunk(trans, root, flags);
4152 }
4153
4154 if (!ret) {
4155 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4156 &root->fs_info->chunk_block_rsv,
4157 thresh, BTRFS_RESERVE_NO_FLUSH);
4158 if (!ret)
4159 trans->chunk_bytes_reserved += thresh;
4138 } 4160 }
4139} 4161}
4140 4162
@@ -5192,6 +5214,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5192 trans->bytes_reserved = 0; 5214 trans->bytes_reserved = 0;
5193} 5215}
5194 5216
5217/*
5218 * Release the bytes that this transaction handle reserved in the chunk block reserve (trans->chunk_bytes_reserved) and reset that counter to zero. Does nothing when the counter is already zero.
5219 * To be called after all the new block groups attached to the transaction handle have been created (btrfs_create_pending_block_groups()); a one-time warning is emitted if trans->new_bgs is still non-empty at that point.
5220 */
5221void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5222{
5223 struct btrfs_fs_info *fs_info = trans->root->fs_info;
5224
5225 if (!trans->chunk_bytes_reserved) /* nothing accounted to this handle, nothing to release */
5226 return;
5227
5228 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); /* block groups are still queued on this handle */
5229
5230 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5231 trans->chunk_bytes_reserved);
5232 trans->chunk_bytes_reserved = 0; /* makes a later call on the same handle a no-op */
5233}
5234
5195/* Can only return 0 or -ENOSPC */ 5235/* Can only return 0 or -ENOSPC */
5196int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5236int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5197 struct inode *inode) 5237 struct inode *inode)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5628e25250c0..03a3ec7e31ea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -509,6 +509,7 @@ again:
509 h->transaction = cur_trans; 509 h->transaction = cur_trans;
510 h->blocks_used = 0; 510 h->blocks_used = 0;
511 h->bytes_reserved = 0; 511 h->bytes_reserved = 0;
512 h->chunk_bytes_reserved = 0;
512 h->root = root; 513 h->root = root;
513 h->delayed_ref_updates = 0; 514 h->delayed_ref_updates = 0;
514 h->use_count = 1; 515 h->use_count = 1;
@@ -792,6 +793,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
792 if (!list_empty(&trans->new_bgs)) 793 if (!list_empty(&trans->new_bgs))
793 btrfs_create_pending_block_groups(trans, root); 794 btrfs_create_pending_block_groups(trans, root);
794 795
796 btrfs_trans_release_chunk_metadata(trans);
797
795 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 798 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
796 should_end_transaction(trans, root) && 799 should_end_transaction(trans, root) &&
797 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { 800 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -2054,6 +2057,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
2054 clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); 2057 clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
2055 clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); 2058 clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
2056 2059
2060 btrfs_trans_release_chunk_metadata(trans);
2061
2057 spin_lock(&root->fs_info->trans_lock); 2062 spin_lock(&root->fs_info->trans_lock);
2058 cur_trans->state = TRANS_STATE_UNBLOCKED; 2063 cur_trans->state = TRANS_STATE_UNBLOCKED;
2059 root->fs_info->running_transaction = NULL; 2064 root->fs_info->running_transaction = NULL;
@@ -2123,6 +2128,7 @@ scrub_continue:
2123 btrfs_scrub_continue(root); 2128 btrfs_scrub_continue(root);
2124cleanup_transaction: 2129cleanup_transaction:
2125 btrfs_trans_release_metadata(trans, root); 2130 btrfs_trans_release_metadata(trans, root);
2131 btrfs_trans_release_chunk_metadata(trans);
2126 trans->block_rsv = NULL; 2132 trans->block_rsv = NULL;
2127 if (trans->qgroup_reserved) { 2133 if (trans->qgroup_reserved) {
2128 btrfs_qgroup_free(root, trans->qgroup_reserved); 2134 btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0b24755596ba..036fa83d6ccb 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -102,6 +102,7 @@ struct btrfs_transaction {
102struct btrfs_trans_handle { 102struct btrfs_trans_handle {
103 u64 transid; 103 u64 transid;
104 u64 bytes_reserved; 104 u64 bytes_reserved;
105 u64 chunk_bytes_reserved;
105 u64 qgroup_reserved; 106 u64 qgroup_reserved;
106 unsigned long use_count; 107 unsigned long use_count;
107 unsigned long blocks_reserved; 108 unsigned long blocks_reserved;