aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorMiao Xie <miaox@cn.fujitsu.com>2014-01-30 03:46:55 -0500
committerJosef Bacik <jbacik@fb.com>2014-03-10 15:15:39 -0400
commitc404e0dc2c843b154f9a36c3aec10d0a715d88eb (patch)
tree643a2ab96708ef72c50679dd8da28e5d519fcf72 /fs/btrfs
parent391cd9df81ac07ce7e66ac8fb13e56693061a6e6 (diff)
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace testing, we hit a null pointer dereference (it was very easy to reproduce by running xfstests' btrfs/011 on devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree, and we forgot to replace the source device in the mappings of those new chunks. - We might get mapping information that includes the source device before the mapping information is updated, and then submit a bio based on that mapping information after we have freed the source device. For the first bug, we can fix it by doing the mapping tree update and the source device removal in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, so the above method avoids new chunk allocation in between, and after we remove the source device, all new chunks will be allocated on the new device. So it fixes the first bug. For the second bug, we need to make sure all in-flight bios are finished and no new bios are produced while we are removing the source device. To fix this problem, we introduce a global @bio_counter; we not only inc/dec @bio_counter outside of map_blocks, but also inc it before submitting a bio and dec @bio_counter when the bio ends. Since raid56 is a little different and device replace doesn't support raid56 yet, it is not addressed in this patch, and I added comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/ctree.h9
-rw-r--r--fs/btrfs/dev-replace.c74
-rw-r--r--fs/btrfs/disk-io.c12
-rw-r--r--fs/btrfs/volumes.c30
-rw-r--r--fs/btrfs/volumes.h1
5 files changed, 111 insertions, 15 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fceddbdfdd3d..dac6653d4cce 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
351#define BTRFS_FS_STATE_ERROR 0 351#define BTRFS_FS_STATE_ERROR 0
352#define BTRFS_FS_STATE_REMOUNTING 1 352#define BTRFS_FS_STATE_REMOUNTING 1
353#define BTRFS_FS_STATE_TRANS_ABORTED 2 353#define BTRFS_FS_STATE_TRANS_ABORTED 2
354#define BTRFS_FS_STATE_DEV_REPLACING 3
354 355
355/* Super block flags */ 356/* Super block flags */
356/* Errors detected */ 357/* Errors detected */
@@ -1674,6 +1675,9 @@ struct btrfs_fs_info {
1674 1675
1675 atomic_t mutually_exclusive_operation_running; 1676 atomic_t mutually_exclusive_operation_running;
1676 1677
1678 struct percpu_counter bio_counter;
1679 wait_queue_head_t replace_wait;
1680
1677 struct semaphore uuid_tree_rescan_sem; 1681 struct semaphore uuid_tree_rescan_sem;
1678 unsigned int update_uuid_tree_gen:1; 1682 unsigned int update_uuid_tree_gen:1;
1679}; 1683};
@@ -4008,6 +4012,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
4008int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 4012int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4009 struct btrfs_scrub_progress *progress); 4013 struct btrfs_scrub_progress *progress);
4010 4014
4015/* dev-replace.c */
4016void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4017void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4018void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
4019
4011/* reada.c */ 4020/* reada.c */
4012struct reada_control { 4021struct reada_control {
4013 struct btrfs_root *root; /* tree to prefetch */ 4022 struct btrfs_root *root; /* tree to prefetch */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index b20d59e5e5dd..ec1c3f3a775d 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
431 return ret; 431 return ret;
432} 432}
433 433
434/*
435 * blocked until all flighting bios are finished.
436 */
437static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
438{
439 s64 writers;
440 DEFINE_WAIT(wait);
441
442 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
443 do {
444 prepare_to_wait(&fs_info->replace_wait, &wait,
445 TASK_UNINTERRUPTIBLE);
446 writers = percpu_counter_sum(&fs_info->bio_counter);
447 if (writers)
448 schedule();
449 finish_wait(&fs_info->replace_wait, &wait);
450 } while (writers);
451}
452
453/*
454 * we have removed target device, it is safe to allow new bios request.
455 */
456static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
457{
458 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
459 if (waitqueue_active(&fs_info->replace_wait))
460 wake_up(&fs_info->replace_wait);
461}
462
434static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 463static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
435 int scrub_ret) 464 int scrub_ret)
436{ 465{
@@ -458,12 +487,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
458 src_device = dev_replace->srcdev; 487 src_device = dev_replace->srcdev;
459 btrfs_dev_replace_unlock(dev_replace); 488 btrfs_dev_replace_unlock(dev_replace);
460 489
461 /* replace old device with new one in mapping tree */
462 if (!scrub_ret)
463 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
464 src_device,
465 tgt_device);
466
467 /* 490 /*
468 * flush all outstanding I/O and inode extent mappings before the 491 * flush all outstanding I/O and inode extent mappings before the
469 * copy operation is declared as being finished 492 * copy operation is declared as being finished
@@ -495,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
495 dev_replace->time_stopped = get_seconds(); 518 dev_replace->time_stopped = get_seconds();
496 dev_replace->item_needs_writeback = 1; 519 dev_replace->item_needs_writeback = 1;
497 520
498 if (scrub_ret) { 521 /* replace old device with new one in mapping tree */
522 if (!scrub_ret) {
523 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
524 src_device,
525 tgt_device);
526 } else {
499 printk_in_rcu(KERN_ERR 527 printk_in_rcu(KERN_ERR
500 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 528 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
501 src_device->missing ? "<missing disk>" : 529 src_device->missing ? "<missing disk>" :
@@ -534,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
534 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 562 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
535 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 563 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
536 564
565 btrfs_rm_dev_replace_blocked(fs_info);
566
537 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 567 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
538 568
569 btrfs_rm_dev_replace_unblocked(fs_info);
570
539 /* 571 /*
540 * this is again a consistent state where no dev_replace procedure 572 * this is again a consistent state where no dev_replace procedure
541 * is running, the target device is part of the filesystem, the 573 * is running, the target device is part of the filesystem, the
@@ -865,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
865 mutex_unlock(&dev_replace->lock_management_lock); 897 mutex_unlock(&dev_replace->lock_management_lock);
866 } 898 }
867} 899}
900
901void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
902{
903 percpu_counter_inc(&fs_info->bio_counter);
904}
905
906void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
907{
908 percpu_counter_dec(&fs_info->bio_counter);
909
910 if (waitqueue_active(&fs_info->replace_wait))
911 wake_up(&fs_info->replace_wait);
912}
913
914void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
915{
916 DEFINE_WAIT(wait);
917again:
918 percpu_counter_inc(&fs_info->bio_counter);
919 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
920 btrfs_bio_counter_dec(fs_info);
921 wait_event(fs_info->replace_wait,
922 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
923 &fs_info->fs_state));
924 goto again;
925 }
926
927}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fcf367581073..0cafacb07b43 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2136,10 +2136,16 @@ int open_ctree(struct super_block *sb,
2136 goto fail_dirty_metadata_bytes; 2136 goto fail_dirty_metadata_bytes;
2137 } 2137 }
2138 2138
2139 ret = percpu_counter_init(&fs_info->bio_counter, 0);
2140 if (ret) {
2141 err = ret;
2142 goto fail_delalloc_bytes;
2143 }
2144
2139 fs_info->btree_inode = new_inode(sb); 2145 fs_info->btree_inode = new_inode(sb);
2140 if (!fs_info->btree_inode) { 2146 if (!fs_info->btree_inode) {
2141 err = -ENOMEM; 2147 err = -ENOMEM;
2142 goto fail_delalloc_bytes; 2148 goto fail_bio_counter;
2143 } 2149 }
2144 2150
2145 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2151 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2214,6 +2220,7 @@ int open_ctree(struct super_block *sb,
2214 atomic_set(&fs_info->scrub_pause_req, 0); 2220 atomic_set(&fs_info->scrub_pause_req, 0);
2215 atomic_set(&fs_info->scrubs_paused, 0); 2221 atomic_set(&fs_info->scrubs_paused, 0);
2216 atomic_set(&fs_info->scrub_cancel_req, 0); 2222 atomic_set(&fs_info->scrub_cancel_req, 0);
2223 init_waitqueue_head(&fs_info->replace_wait);
2217 init_waitqueue_head(&fs_info->scrub_pause_wait); 2224 init_waitqueue_head(&fs_info->scrub_pause_wait);
2218 fs_info->scrub_workers_refcnt = 0; 2225 fs_info->scrub_workers_refcnt = 0;
2219#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2226#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2966,6 +2973,8 @@ fail_iput:
2966 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2973 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2967 2974
2968 iput(fs_info->btree_inode); 2975 iput(fs_info->btree_inode);
2976fail_bio_counter:
2977 percpu_counter_destroy(&fs_info->bio_counter);
2969fail_delalloc_bytes: 2978fail_delalloc_bytes:
2970 percpu_counter_destroy(&fs_info->delalloc_bytes); 2979 percpu_counter_destroy(&fs_info->delalloc_bytes);
2971fail_dirty_metadata_bytes: 2980fail_dirty_metadata_bytes:
@@ -3613,6 +3622,7 @@ int close_ctree(struct btrfs_root *root)
3613 3622
3614 percpu_counter_destroy(&fs_info->dirty_metadata_bytes); 3623 percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3615 percpu_counter_destroy(&fs_info->delalloc_bytes); 3624 percpu_counter_destroy(&fs_info->delalloc_bytes);
3625 percpu_counter_destroy(&fs_info->bio_counter);
3616 bdi_destroy(&fs_info->bdi); 3626 bdi_destroy(&fs_info->bdi);
3617 cleanup_srcu_struct(&fs_info->subvol_srcu); 3627 cleanup_srcu_struct(&fs_info->subvol_srcu);
3618 3628
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b68afe32419f..07629e99809a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5263,6 +5263,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5263static void btrfs_end_bio(struct bio *bio, int err) 5263static void btrfs_end_bio(struct bio *bio, int err)
5264{ 5264{
5265 struct btrfs_bio *bbio = bio->bi_private; 5265 struct btrfs_bio *bbio = bio->bi_private;
5266 struct btrfs_device *dev = bbio->stripes[0].dev;
5266 int is_orig_bio = 0; 5267 int is_orig_bio = 0;
5267 5268
5268 if (err) { 5269 if (err) {
@@ -5270,7 +5271,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
5270 if (err == -EIO || err == -EREMOTEIO) { 5271 if (err == -EIO || err == -EREMOTEIO) {
5271 unsigned int stripe_index = 5272 unsigned int stripe_index =
5272 btrfs_io_bio(bio)->stripe_index; 5273 btrfs_io_bio(bio)->stripe_index;
5273 struct btrfs_device *dev;
5274 5274
5275 BUG_ON(stripe_index >= bbio->num_stripes); 5275 BUG_ON(stripe_index >= bbio->num_stripes);
5276 dev = bbio->stripes[stripe_index].dev; 5276 dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5292,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
5292 if (bio == bbio->orig_bio) 5292 if (bio == bbio->orig_bio)
5293 is_orig_bio = 1; 5293 is_orig_bio = 1;
5294 5294
5295 btrfs_bio_counter_dec(bbio->fs_info);
5296
5295 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5297 if (atomic_dec_and_test(&bbio->stripes_pending)) {
5296 if (!is_orig_bio) { 5298 if (!is_orig_bio) {
5297 bio_put(bio); 5299 bio_put(bio);
@@ -5440,6 +5442,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5440 } 5442 }
5441#endif 5443#endif
5442 bio->bi_bdev = dev->bdev; 5444 bio->bi_bdev = dev->bdev;
5445
5446 btrfs_bio_counter_inc_noblocked(root->fs_info);
5447
5443 if (async) 5448 if (async)
5444 btrfs_schedule_bio(root, dev, rw, bio); 5449 btrfs_schedule_bio(root, dev, rw, bio);
5445 else 5450 else
@@ -5508,28 +5513,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5508 length = bio->bi_size; 5513 length = bio->bi_size;
5509 map_length = length; 5514 map_length = length;
5510 5515
5516 btrfs_bio_counter_inc_blocked(root->fs_info);
5511 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5517 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5512 mirror_num, &raid_map); 5518 mirror_num, &raid_map);
5513 if (ret) /* -ENOMEM */ 5519 if (ret) {
5520 btrfs_bio_counter_dec(root->fs_info);
5514 return ret; 5521 return ret;
5522 }
5515 5523
5516 total_devs = bbio->num_stripes; 5524 total_devs = bbio->num_stripes;
5517 bbio->orig_bio = first_bio; 5525 bbio->orig_bio = first_bio;
5518 bbio->private = first_bio->bi_private; 5526 bbio->private = first_bio->bi_private;
5519 bbio->end_io = first_bio->bi_end_io; 5527 bbio->end_io = first_bio->bi_end_io;
5528 bbio->fs_info = root->fs_info;
5520 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5529 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5521 5530
5522 if (raid_map) { 5531 if (raid_map) {
5523 /* In this case, map_length has been set to the length of 5532 /* In this case, map_length has been set to the length of
5524 a single stripe; not the whole write */ 5533 a single stripe; not the whole write */
5525 if (rw & WRITE) { 5534 if (rw & WRITE) {
5526 return raid56_parity_write(root, bio, bbio, 5535 ret = raid56_parity_write(root, bio, bbio,
5527 raid_map, map_length); 5536 raid_map, map_length);
5528 } else { 5537 } else {
5529 return raid56_parity_recover(root, bio, bbio, 5538 ret = raid56_parity_recover(root, bio, bbio,
5530 raid_map, map_length, 5539 raid_map, map_length,
5531 mirror_num); 5540 mirror_num);
5532 } 5541 }
5542 /*
5543 * FIXME, replace dosen't support raid56 yet, please fix
5544 * it in the future.
5545 */
5546 btrfs_bio_counter_dec(root->fs_info);
5547 return ret;
5533 } 5548 }
5534 5549
5535 if (map_length < length) { 5550 if (map_length < length) {
@@ -5571,6 +5586,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5571 async_submit); 5586 async_submit);
5572 dev_nr++; 5587 dev_nr++;
5573 } 5588 }
5589 btrfs_bio_counter_dec(root->fs_info);
5574 return 0; 5590 return 0;
5575} 5591}
5576 5592
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
192 192
193struct btrfs_bio { 193struct btrfs_bio {
194 atomic_t stripes_pending; 194 atomic_t stripes_pending;
195 struct btrfs_fs_info *fs_info;
195 bio_end_io_t *end_io; 196 bio_end_io_t *end_io;
196 struct bio *orig_bio; 197 struct bio *orig_bio;
197 void *private; 198 void *private;