author    Josef Bacik <jbacik@fusionio.com>    2013-09-12 16:58:28 -0400
committer Chris Mason <chris.mason@fusionio.com>    2013-09-21 11:05:26 -0400
commit    652f25a2921eb32a3e6f88aed3c59b494c1287c4
tree      bb7d2dc93f26e9f8cefe00b590304755a07558eb
parent    d555438b6e1dad139d26ab807f0af68a1fa47a86
Btrfs: improve replacing nocow extents
Various people have hit a deadlock when running btrfs/011. This is because
when replacing nocow extents we will take the i_mutex to make sure nobody
messes with the file while we are replacing the extent. The problem is we
are already holding a transaction open, which is a locking inversion, so
instead we need to save these inodes we find and then process them outside
of the transaction.

Further we can't just lock the inode and assume we are good to go. We need
to lock the extent range and then read back the extent cache for the inode
to make sure the extent really still points at the physical block we want.
If it doesn't we don't have to copy it. Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
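[Editor's note: the shape of the fix is a "record now, process later" pattern:
while the transaction is open, the iteration callback only records which
inodes need work, and the per-inode work that takes i_mutex runs only after
btrfs_end_transaction(). Below is a minimal, hypothetical userspace sketch of
that ordering; the names (record_item, process_items) are illustrative only
and are not part of the btrfs code.]

/*
 * Hypothetical sketch (not the btrfs code): record work items while the
 * "transaction" is open, process them only after it ends, so the per-item
 * locking never nests inside the transaction.
 */
#include <stdio.h>
#include <stdlib.h>

struct work_item {
	unsigned long long inum;	/* inode to handle later */
	struct work_item *next;
};

/* Runs inside the transaction: record only, take no other locks. */
static int record_item(struct work_item **head, unsigned long long inum)
{
	struct work_item *item = calloc(1, sizeof(*item));

	if (!item)
		return -1;	/* would be -ENOMEM in the kernel */
	item->inum = inum;
	item->next = *head;
	*head = item;
	return 0;
}

/* Runs after the transaction ends: safe to take i_mutex-like locks. */
static void process_items(struct work_item **head)
{
	while (*head) {
		struct work_item *item = *head;

		*head = item->next;
		printf("copying nocow pages for inode %llu\n", item->inum);
		free(item);
	}
}

int main(void)
{
	struct work_item *head = NULL;

	record_item(&head, 257);	/* "transaction open" phase */
	record_item(&head, 258);
	process_items(&head);		/* "transaction ended" phase */
	return 0;
}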
 fs/btrfs/scrub.c | 112 ++++++++++++++-----
 1 file changed, 98 insertions(+), 14 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0afcd452fcb3..a18e0e23f6a6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -158,12 +158,20 @@ struct scrub_fixup_nodatasum {
 	int			mirror_num;
 };
 
+struct scrub_nocow_inode {
+	u64			inum;
+	u64			offset;
+	u64			root;
+	struct list_head	list;
+};
+
 struct scrub_copy_nocow_ctx {
 	struct scrub_ctx	*sctx;
 	u64			logical;
 	u64			len;
 	int			mirror_num;
 	u64			physical_for_dev_replace;
+	struct list_head	inodes;
 	struct btrfs_work	work;
 };
 
@@ -245,7 +253,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 static int write_page_nocow(struct scrub_ctx *sctx,
 			    u64 physical_for_dev_replace, struct page *page);
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
-				      void *ctx);
+				      struct scrub_copy_nocow_ctx *ctx);
 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 			    int mirror_num, u64 physical_for_dev_replace);
 static void copy_nocow_pages_worker(struct btrfs_work *work);
@@ -3126,12 +3134,30 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 	nocow_ctx->mirror_num = mirror_num;
 	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
 	nocow_ctx->work.func = copy_nocow_pages_worker;
+	INIT_LIST_HEAD(&nocow_ctx->inodes);
 	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
 			   &nocow_ctx->work);
 
 	return 0;
 }
 
+static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+	struct scrub_nocow_inode *nocow_inode;
+
+	nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
+	if (!nocow_inode)
+		return -ENOMEM;
+	nocow_inode->inum = inum;
+	nocow_inode->offset = offset;
+	nocow_inode->root = root;
+	list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
+	return 0;
+}
+
+#define COPY_COMPLETE 1
+
 static void copy_nocow_pages_worker(struct btrfs_work *work)
 {
 	struct scrub_copy_nocow_ctx *nocow_ctx =
@@ -3167,8 +3193,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
 	}
 
 	ret = iterate_inodes_from_logical(logical, fs_info, path,
-					  copy_nocow_pages_for_inode,
-					  nocow_ctx);
+					  record_inode_for_nocow, nocow_ctx);
 	if (ret != 0 && ret != -ENOENT) {
 		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
 			logical, physical_for_dev_replace, len, mirror_num,
@@ -3177,7 +3202,33 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
 		goto out;
 	}
 
+	btrfs_end_transaction(trans, root);
+	trans = NULL;
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
+						 entry->root, nocow_ctx);
+		kfree(entry);
+		if (ret == COPY_COMPLETE) {
+			ret = 0;
+			break;
+		} else if (ret) {
+			break;
+		}
+	}
 out:
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, root);
 	if (not_written)
@@ -3190,20 +3241,25 @@ out:
 	scrub_pending_trans_workers_dec(sctx);
 }
 
-static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      struct scrub_copy_nocow_ctx *nocow_ctx)
 {
-	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
 	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
 	struct btrfs_key key;
 	struct inode *inode;
 	struct page *page;
 	struct btrfs_root *local_root;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
+	struct extent_io_tree *io_tree;
 	u64 physical_for_dev_replace;
-	u64 len;
+	u64 len = nocow_ctx->len;
+	u64 lockstart = offset, lockend = offset + len - 1;
 	unsigned long index;
 	int srcu_index;
-	int ret;
-	int err;
+	int ret = 0;
+	int err = 0;
 
 	key.objectid = root;
 	key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3229,9 +3285,33 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	mutex_lock(&inode->i_mutex);
 	inode_dio_wait(inode);
 
-	ret = 0;
 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
-	len = nocow_ctx->len;
+	io_tree = &BTRFS_I(inode)->io_tree;
+
+	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		goto out_unlock;
+	}
+
+	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_unlock;
+	}
+
+	/*
+	 * This extent does not actually cover the logical extent anymore,
+	 * move on to the next inode.
+	 */
+	if (em->block_start > nocow_ctx->logical ||
+	    em->block_start + em->block_len < nocow_ctx->logical + len) {
+		free_extent_map(em);
+		goto out_unlock;
+	}
+	free_extent_map(em);
+
 	while (len >= PAGE_CACHE_SIZE) {
 		index = offset >> PAGE_CACHE_SHIFT;
 again:
@@ -3247,10 +3327,9 @@ again:
 			goto next_page;
 		} else {
 			ClearPageError(page);
-			err = extent_read_full_page(&BTRFS_I(inode)->
-						    io_tree,
-						    page, btrfs_get_extent,
-						    nocow_ctx->mirror_num);
+			err = extent_read_full_page_nolock(io_tree, page,
+							   btrfs_get_extent,
+							   nocow_ctx->mirror_num);
 			if (err) {
 				ret = err;
 				goto next_page;
@@ -3264,6 +3343,7 @@ again:
 			 * page in the page cache.
 			 */
 			if (page->mapping != inode->i_mapping) {
+				unlock_page(page);
 				page_cache_release(page);
 				goto again;
 			}
@@ -3287,6 +3367,10 @@ next_page:
 		physical_for_dev_replace += PAGE_CACHE_SIZE;
 		len -= PAGE_CACHE_SIZE;
 	}
+	ret = COPY_COMPLETE;
+out_unlock:
+	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+			     GFP_NOFS);
 out:
 	mutex_unlock(&inode->i_mutex);
 	iput(inode);