diff options
| author | NeilBrown <neilb@suse.de> | 2008-07-21 03:05:25 -0400 |
|---|---|---|
| committer | NeilBrown <neilb@suse.de> | 2008-07-21 03:05:25 -0400 |
| commit | 4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 (patch) | |
| tree | 5e2ba7d509af245c29bdf04b00960cc367972c44 | |
| parent | f2ea68cf42aafdd93393b6b8b20fc3c2b5f4390c (diff) | |
md: Protect access to mddev->disks list using RCU
All modifications and most access to the mddev->disks list are made
under the reconfig_mutex lock. However there are three places where
the list is walked without any locking. If a reconfig happens at this
time, havoc (and oops) can ensue.
So use RCU to protect these accesses:
- wrap them in rcu_read_{,un}lock()
- use list_for_each_entry_rcu
- add to the list with list_add_rcu
- delete from the list with list_del_rcu
- delay the 'free' with call_rcu rather than schedule_work
Note that export_rdev did a list_del_init on this list. In almost all
cases the entry was not in the list anymore so it was a no-op and so
safe. It is no longer safe as after list_del_rcu we may not touch
the list_head.
An audit shows that export_rdev is called:
- after unbind_rdev_from_array, in which case the delete has
already been done,
- after bind_rdev_to_array fails, in which case the delete isn't needed.
- before the device has been put on a list at all (e.g. in
add_new_disk where reading the superblock fails).
- and in autorun devices after a failure when the device is on a
different list.
So remove the list_del_init call from export_rdev, and add it back
immediately before the called to export_rdev for that last case.
Note also that ->same_set is sometimes used for lists other than
mddev->list (e.g. candidates). In these cases rcu is not needed.
Signed-off-by: NeilBrown <neilb@suse.de>
| -rw-r--r-- | drivers/md/bitmap.c | 15 | ||||
| -rw-r--r-- | drivers/md/md.c | 30 | ||||
| -rw-r--r-- | include/linux/raid/md_k.h | 3 |
3 files changed, 31 insertions, 17 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index eba83e25b678..621a272a2c74 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
| @@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde | |||
| 241 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | 241 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) |
| 242 | { | 242 | { |
| 243 | mdk_rdev_t *rdev; | 243 | mdk_rdev_t *rdev; |
| 244 | struct list_head *tmp; | ||
| 245 | mddev_t *mddev = bitmap->mddev; | 244 | mddev_t *mddev = bitmap->mddev; |
| 246 | 245 | ||
| 247 | rdev_for_each(rdev, tmp, mddev) | 246 | rcu_read_lock(); |
| 247 | rdev_for_each_rcu(rdev, mddev) | ||
| 248 | if (test_bit(In_sync, &rdev->flags) | 248 | if (test_bit(In_sync, &rdev->flags) |
| 249 | && !test_bit(Faulty, &rdev->flags)) { | 249 | && !test_bit(Faulty, &rdev->flags)) { |
| 250 | int size = PAGE_SIZE; | 250 | int size = PAGE_SIZE; |
| @@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 260 | + (long)(page->index * (PAGE_SIZE/512)) | 260 | + (long)(page->index * (PAGE_SIZE/512)) |
| 261 | + size/512 > 0) | 261 | + size/512 > 0) |
| 262 | /* bitmap runs in to metadata */ | 262 | /* bitmap runs in to metadata */ |
| 263 | return -EINVAL; | 263 | goto bad_alignment; |
| 264 | if (rdev->data_offset + mddev->size*2 | 264 | if (rdev->data_offset + mddev->size*2 |
| 265 | > rdev->sb_start + bitmap->offset) | 265 | > rdev->sb_start + bitmap->offset) |
| 266 | /* data runs in to bitmap */ | 266 | /* data runs in to bitmap */ |
| 267 | return -EINVAL; | 267 | goto bad_alignment; |
| 268 | } else if (rdev->sb_start < rdev->data_offset) { | 268 | } else if (rdev->sb_start < rdev->data_offset) { |
| 269 | /* METADATA BITMAP DATA */ | 269 | /* METADATA BITMAP DATA */ |
| 270 | if (rdev->sb_start | 270 | if (rdev->sb_start |
| @@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 272 | + page->index*(PAGE_SIZE/512) + size/512 | 272 | + page->index*(PAGE_SIZE/512) + size/512 |
| 273 | > rdev->data_offset) | 273 | > rdev->data_offset) |
| 274 | /* bitmap runs in to data */ | 274 | /* bitmap runs in to data */ |
| 275 | return -EINVAL; | 275 | goto bad_alignment; |
| 276 | } else { | 276 | } else { |
| 277 | /* DATA METADATA BITMAP - no problems */ | 277 | /* DATA METADATA BITMAP - no problems */ |
| 278 | } | 278 | } |
| @@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 282 | size, | 282 | size, |
| 283 | page); | 283 | page); |
| 284 | } | 284 | } |
| 285 | rcu_read_unlock(); | ||
| 285 | 286 | ||
| 286 | if (wait) | 287 | if (wait) |
| 287 | md_super_wait(mddev); | 288 | md_super_wait(mddev); |
| 288 | return 0; | 289 | return 0; |
| 290 | |||
| 291 | bad_alignment: | ||
| 292 | rcu_read_unlock(); | ||
| 293 | return -EINVAL; | ||
| 289 | } | 294 | } |
| 290 | 295 | ||
| 291 | static void bitmap_file_kick(struct bitmap *bitmap); | 296 | static void bitmap_file_kick(struct bitmap *bitmap); |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 450f30b6edf9..c2ff77ccec50 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -1395,15 +1395,17 @@ static struct super_type super_types[] = { | |||
| 1395 | 1395 | ||
| 1396 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | 1396 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) |
| 1397 | { | 1397 | { |
| 1398 | struct list_head *tmp, *tmp2; | ||
| 1399 | mdk_rdev_t *rdev, *rdev2; | 1398 | mdk_rdev_t *rdev, *rdev2; |
| 1400 | 1399 | ||
| 1401 | rdev_for_each(rdev, tmp, mddev1) | 1400 | rcu_read_lock(); |
| 1402 | rdev_for_each(rdev2, tmp2, mddev2) | 1401 | rdev_for_each_rcu(rdev, mddev1) |
| 1402 | rdev_for_each_rcu(rdev2, mddev2) | ||
| 1403 | if (rdev->bdev->bd_contains == | 1403 | if (rdev->bdev->bd_contains == |
| 1404 | rdev2->bdev->bd_contains) | 1404 | rdev2->bdev->bd_contains) { |
| 1405 | rcu_read_unlock(); | ||
| 1405 | return 1; | 1406 | return 1; |
| 1406 | 1407 | } | |
| 1408 | rcu_read_unlock(); | ||
| 1407 | return 0; | 1409 | return 0; |
| 1408 | } | 1410 | } |
| 1409 | 1411 | ||
| @@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
| 1470 | kobject_del(&rdev->kobj); | 1472 | kobject_del(&rdev->kobj); |
| 1471 | goto fail; | 1473 | goto fail; |
| 1472 | } | 1474 | } |
| 1473 | list_add(&rdev->same_set, &mddev->disks); | 1475 | list_add_rcu(&rdev->same_set, &mddev->disks); |
| 1474 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); | 1476 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); |
| 1475 | return 0; | 1477 | return 0; |
| 1476 | 1478 | ||
| @@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
| 1495 | return; | 1497 | return; |
| 1496 | } | 1498 | } |
| 1497 | bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); | 1499 | bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); |
| 1498 | list_del_init(&rdev->same_set); | 1500 | list_del_rcu(&rdev->same_set); |
| 1499 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | 1501 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); |
| 1500 | rdev->mddev = NULL; | 1502 | rdev->mddev = NULL; |
| 1501 | sysfs_remove_link(&rdev->kobj, "block"); | 1503 | sysfs_remove_link(&rdev->kobj, "block"); |
| 1502 | 1504 | ||
| 1503 | /* We need to delay this, otherwise we can deadlock when | 1505 | /* We need to delay this, otherwise we can deadlock when |
| 1504 | * writing to 'remove' to "dev/state" | 1506 | * writing to 'remove' to "dev/state". We also need |
| 1507 | * to delay it due to rcu usage. | ||
| 1505 | */ | 1508 | */ |
| 1509 | synchronize_rcu(); | ||
| 1506 | INIT_WORK(&rdev->del_work, md_delayed_delete); | 1510 | INIT_WORK(&rdev->del_work, md_delayed_delete); |
| 1507 | kobject_get(&rdev->kobj); | 1511 | kobject_get(&rdev->kobj); |
| 1508 | schedule_work(&rdev->del_work); | 1512 | schedule_work(&rdev->del_work); |
| @@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev) | |||
| 1558 | if (rdev->mddev) | 1562 | if (rdev->mddev) |
| 1559 | MD_BUG(); | 1563 | MD_BUG(); |
| 1560 | free_disk_sb(rdev); | 1564 | free_disk_sb(rdev); |
| 1561 | list_del_init(&rdev->same_set); | ||
| 1562 | #ifndef MODULE | 1565 | #ifndef MODULE |
| 1563 | if (test_bit(AutoDetected, &rdev->flags)) | 1566 | if (test_bit(AutoDetected, &rdev->flags)) |
| 1564 | md_autodetect_dev(rdev->bdev->bd_dev); | 1567 | md_autodetect_dev(rdev->bdev->bd_dev); |
| @@ -4062,8 +4065,10 @@ static void autorun_devices(int part) | |||
| 4062 | /* on success, candidates will be empty, on error | 4065 | /* on success, candidates will be empty, on error |
| 4063 | * it won't... | 4066 | * it won't... |
| 4064 | */ | 4067 | */ |
| 4065 | rdev_for_each_list(rdev, tmp, candidates) | 4068 | rdev_for_each_list(rdev, tmp, candidates) { |
| 4069 | list_del_init(&rdev->same_set); | ||
| 4066 | export_rdev(rdev); | 4070 | export_rdev(rdev); |
| 4071 | } | ||
| 4067 | mddev_put(mddev); | 4072 | mddev_put(mddev); |
| 4068 | } | 4073 | } |
| 4069 | printk(KERN_INFO "md: ... autorun DONE.\n"); | 4074 | printk(KERN_INFO "md: ... autorun DONE.\n"); |
| @@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p) | |||
| 5529 | static int is_mddev_idle(mddev_t *mddev) | 5534 | static int is_mddev_idle(mddev_t *mddev) |
| 5530 | { | 5535 | { |
| 5531 | mdk_rdev_t * rdev; | 5536 | mdk_rdev_t * rdev; |
| 5532 | struct list_head *tmp; | ||
| 5533 | int idle; | 5537 | int idle; |
| 5534 | long curr_events; | 5538 | long curr_events; |
| 5535 | 5539 | ||
| 5536 | idle = 1; | 5540 | idle = 1; |
| 5537 | rdev_for_each(rdev, tmp, mddev) { | 5541 | rcu_read_lock(); |
| 5542 | rdev_for_each_rcu(rdev, mddev) { | ||
| 5538 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; | 5543 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; |
| 5539 | curr_events = disk_stat_read(disk, sectors[0]) + | 5544 | curr_events = disk_stat_read(disk, sectors[0]) + |
| 5540 | disk_stat_read(disk, sectors[1]) - | 5545 | disk_stat_read(disk, sectors[1]) - |
| @@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev) | |||
| 5566 | idle = 0; | 5571 | idle = 0; |
| 5567 | } | 5572 | } |
| 5568 | } | 5573 | } |
| 5574 | rcu_read_unlock(); | ||
| 5569 | return idle; | 5575 | return idle; |
| 5570 | } | 5576 | } |
| 5571 | 5577 | ||
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4bef4791d80d..9f2549ac0e2d 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h | |||
| @@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev) | |||
| 339 | #define rdev_for_each(rdev, tmp, mddev) \ | 339 | #define rdev_for_each(rdev, tmp, mddev) \ |
| 340 | rdev_for_each_list(rdev, tmp, (mddev)->disks) | 340 | rdev_for_each_list(rdev, tmp, (mddev)->disks) |
| 341 | 341 | ||
| 342 | #define rdev_for_each_rcu(rdev, mddev) \ | ||
| 343 | list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) | ||
| 344 | |||
| 342 | typedef struct mdk_thread_s { | 345 | typedef struct mdk_thread_s { |
| 343 | void (*run) (mddev_t *mddev); | 346 | void (*run) (mddev_t *mddev); |
| 344 | mddev_t *mddev; | 347 | mddev_t *mddev; |
