aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2008-07-21 03:05:25 -0400
committerNeilBrown <neilb@suse.de>2008-07-21 03:05:25 -0400
commit4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 (patch)
tree5e2ba7d509af245c29bdf04b00960cc367972c44
parentf2ea68cf42aafdd93393b6b8b20fc3c2b5f4390c (diff)
md: Protect access to mddev->disks list using RCU
All modifications and most access to the mddev->disks list are made under the reconfig_mutex lock. However there are three places where the list is walked without any locking. If a reconfig happens at this time, havoc (and oops) can ensue. So use RCU to protect these accesses: - wrap them in rcu_read_{,un}lock() - use list_for_each_entry_rcu - add to the list with list_add_rcu - delete from the list with list_del_rcu - delay the 'free' with call_rcu rather than schedule_work Note that export_rdev did a list_del_init on this list. In almost all cases the entry was not in the list anymore so it was a no-op and so safe. It is no longer safe as after list_del_rcu we may not touch the list_head. An audit shows that export_rdev is called: - after unbind_rdev_from_array, in which case the delete has already been done, - after bind_rdev_to_array fails, in which case the delete isn't needed. - before the device has been put on a list at all (e.g. in add_new_disk where reading the superblock fails). - and in autorun devices after a failure when the device is on a different list. So remove the list_del_init call from export_rdev, and add it back immediately before the called to export_rdev for that last case. Note also that ->same_set is sometimes used for lists other than mddev->list (e.g. candidates). In these cases rcu is not needed. Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/bitmap.c15
-rw-r--r--drivers/md/md.c30
-rw-r--r--include/linux/raid/md_k.h3
3 files changed, 31 insertions, 17 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index eba83e25b678..621a272a2c74 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
241static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) 241static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
242{ 242{
243 mdk_rdev_t *rdev; 243 mdk_rdev_t *rdev;
244 struct list_head *tmp;
245 mddev_t *mddev = bitmap->mddev; 244 mddev_t *mddev = bitmap->mddev;
246 245
247 rdev_for_each(rdev, tmp, mddev) 246 rcu_read_lock();
247 rdev_for_each_rcu(rdev, mddev)
248 if (test_bit(In_sync, &rdev->flags) 248 if (test_bit(In_sync, &rdev->flags)
249 && !test_bit(Faulty, &rdev->flags)) { 249 && !test_bit(Faulty, &rdev->flags)) {
250 int size = PAGE_SIZE; 250 int size = PAGE_SIZE;
@@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
260 + (long)(page->index * (PAGE_SIZE/512)) 260 + (long)(page->index * (PAGE_SIZE/512))
261 + size/512 > 0) 261 + size/512 > 0)
262 /* bitmap runs in to metadata */ 262 /* bitmap runs in to metadata */
263 return -EINVAL; 263 goto bad_alignment;
264 if (rdev->data_offset + mddev->size*2 264 if (rdev->data_offset + mddev->size*2
265 > rdev->sb_start + bitmap->offset) 265 > rdev->sb_start + bitmap->offset)
266 /* data runs in to bitmap */ 266 /* data runs in to bitmap */
267 return -EINVAL; 267 goto bad_alignment;
268 } else if (rdev->sb_start < rdev->data_offset) { 268 } else if (rdev->sb_start < rdev->data_offset) {
269 /* METADATA BITMAP DATA */ 269 /* METADATA BITMAP DATA */
270 if (rdev->sb_start 270 if (rdev->sb_start
@@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
272 + page->index*(PAGE_SIZE/512) + size/512 272 + page->index*(PAGE_SIZE/512) + size/512
273 > rdev->data_offset) 273 > rdev->data_offset)
274 /* bitmap runs in to data */ 274 /* bitmap runs in to data */
275 return -EINVAL; 275 goto bad_alignment;
276 } else { 276 } else {
277 /* DATA METADATA BITMAP - no problems */ 277 /* DATA METADATA BITMAP - no problems */
278 } 278 }
@@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
282 size, 282 size,
283 page); 283 page);
284 } 284 }
285 rcu_read_unlock();
285 286
286 if (wait) 287 if (wait)
287 md_super_wait(mddev); 288 md_super_wait(mddev);
288 return 0; 289 return 0;
290
291 bad_alignment:
292 rcu_read_unlock();
293 return -EINVAL;
289} 294}
290 295
291static void bitmap_file_kick(struct bitmap *bitmap); 296static void bitmap_file_kick(struct bitmap *bitmap);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 450f30b6edf9..c2ff77ccec50 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1395,15 +1395,17 @@ static struct super_type super_types[] = {
1395 1395
1396static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1396static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1397{ 1397{
1398 struct list_head *tmp, *tmp2;
1399 mdk_rdev_t *rdev, *rdev2; 1398 mdk_rdev_t *rdev, *rdev2;
1400 1399
1401 rdev_for_each(rdev, tmp, mddev1) 1400 rcu_read_lock();
1402 rdev_for_each(rdev2, tmp2, mddev2) 1401 rdev_for_each_rcu(rdev, mddev1)
1402 rdev_for_each_rcu(rdev2, mddev2)
1403 if (rdev->bdev->bd_contains == 1403 if (rdev->bdev->bd_contains ==
1404 rdev2->bdev->bd_contains) 1404 rdev2->bdev->bd_contains) {
1405 rcu_read_unlock();
1405 return 1; 1406 return 1;
1406 1407 }
1408 rcu_read_unlock();
1407 return 0; 1409 return 0;
1408} 1410}
1409 1411
@@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1470 kobject_del(&rdev->kobj); 1472 kobject_del(&rdev->kobj);
1471 goto fail; 1473 goto fail;
1472 } 1474 }
1473 list_add(&rdev->same_set, &mddev->disks); 1475 list_add_rcu(&rdev->same_set, &mddev->disks);
1474 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1476 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1475 return 0; 1477 return 0;
1476 1478
@@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1495 return; 1497 return;
1496 } 1498 }
1497 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1499 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1498 list_del_init(&rdev->same_set); 1500 list_del_rcu(&rdev->same_set);
1499 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1501 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1500 rdev->mddev = NULL; 1502 rdev->mddev = NULL;
1501 sysfs_remove_link(&rdev->kobj, "block"); 1503 sysfs_remove_link(&rdev->kobj, "block");
1502 1504
1503 /* We need to delay this, otherwise we can deadlock when 1505 /* We need to delay this, otherwise we can deadlock when
1504 * writing to 'remove' to "dev/state" 1506 * writing to 'remove' to "dev/state". We also need
1507 * to delay it due to rcu usage.
1505 */ 1508 */
1509 synchronize_rcu();
1506 INIT_WORK(&rdev->del_work, md_delayed_delete); 1510 INIT_WORK(&rdev->del_work, md_delayed_delete);
1507 kobject_get(&rdev->kobj); 1511 kobject_get(&rdev->kobj);
1508 schedule_work(&rdev->del_work); 1512 schedule_work(&rdev->del_work);
@@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
1558 if (rdev->mddev) 1562 if (rdev->mddev)
1559 MD_BUG(); 1563 MD_BUG();
1560 free_disk_sb(rdev); 1564 free_disk_sb(rdev);
1561 list_del_init(&rdev->same_set);
1562#ifndef MODULE 1565#ifndef MODULE
1563 if (test_bit(AutoDetected, &rdev->flags)) 1566 if (test_bit(AutoDetected, &rdev->flags))
1564 md_autodetect_dev(rdev->bdev->bd_dev); 1567 md_autodetect_dev(rdev->bdev->bd_dev);
@@ -4062,8 +4065,10 @@ static void autorun_devices(int part)
4062 /* on success, candidates will be empty, on error 4065 /* on success, candidates will be empty, on error
4063 * it won't... 4066 * it won't...
4064 */ 4067 */
4065 rdev_for_each_list(rdev, tmp, candidates) 4068 rdev_for_each_list(rdev, tmp, candidates) {
4069 list_del_init(&rdev->same_set);
4066 export_rdev(rdev); 4070 export_rdev(rdev);
4071 }
4067 mddev_put(mddev); 4072 mddev_put(mddev);
4068 } 4073 }
4069 printk(KERN_INFO "md: ... autorun DONE.\n"); 4074 printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
5529static int is_mddev_idle(mddev_t *mddev) 5534static int is_mddev_idle(mddev_t *mddev)
5530{ 5535{
5531 mdk_rdev_t * rdev; 5536 mdk_rdev_t * rdev;
5532 struct list_head *tmp;
5533 int idle; 5537 int idle;
5534 long curr_events; 5538 long curr_events;
5535 5539
5536 idle = 1; 5540 idle = 1;
5537 rdev_for_each(rdev, tmp, mddev) { 5541 rcu_read_lock();
5542 rdev_for_each_rcu(rdev, mddev) {
5538 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5543 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5539 curr_events = disk_stat_read(disk, sectors[0]) + 5544 curr_events = disk_stat_read(disk, sectors[0]) +
5540 disk_stat_read(disk, sectors[1]) - 5545 disk_stat_read(disk, sectors[1]) -
@@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
5566 idle = 0; 5571 idle = 0;
5567 } 5572 }
5568 } 5573 }
5574 rcu_read_unlock();
5569 return idle; 5575 return idle;
5570} 5576}
5571 5577
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 4bef4791d80d..9f2549ac0e2d 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
339#define rdev_for_each(rdev, tmp, mddev) \ 339#define rdev_for_each(rdev, tmp, mddev) \
340 rdev_for_each_list(rdev, tmp, (mddev)->disks) 340 rdev_for_each_list(rdev, tmp, (mddev)->disks)
341 341
342#define rdev_for_each_rcu(rdev, mddev) \
343 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
344
342typedef struct mdk_thread_s { 345typedef struct mdk_thread_s {
343 void (*run) (mddev_t *mddev); 346 void (*run) (mddev_t *mddev);
344 mddev_t *mddev; 347 mddev_t *mddev;