aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2009-01-08 16:31:10 -0500
committerNeilBrown <neilb@suse.de>2009-01-08 16:31:10 -0500
commitd3374825ce57ba2214d375023979f6197ccc1385 (patch)
tree441ea927a7c702e4eadeafbac8be97d664bfb83b /drivers/md
parenta21d15042d8cd736caf82c2bac564f3f93f3d017 (diff)
md: make devices disappear when they are no longer needed.
Currently md devices, once created, never disappear until the module is unloaded. This is essentially because the gendisk holds a reference to the mddev, and the mddev holds a reference to the gendisk, thus a circular reference. If we drop the reference from mddev to gendisk, then we need to ensure that the mddev is destroyed when the gendisk is destroyed. However it is not possible to hook into the gendisk destruction process to enable this. So we drop the reference from the gendisk to the mddev and destroy the gendisk when the mddev gets destroyed. However this has a complication. Between the call __blkdev_get->get_gendisk->kobj_lookup->md_probe and the call __blkdev_get->md_open there is no obvious way to hold a reference on the mddev any more, so unless something is done, it will disappear and gendisk will be destroyed prematurely. Also, once we decide to destroy the mddev, there will be an unlockable moment before the gendisk is unlinked (blk_unregister_region) during which a new reference to the gendisk can be created. We need to ensure that this reference can not be used. i.e. the ->open must fail. So: 1/ in md_probe we set a flag in the mddev (hold_active) which indicates that the array should be treated as active, even though there are no references, and no appearance of activity. This is cleared by md_release when the device is closed if it is no longer needed. This ensures that the gendisk will survive between md_probe and md_open. 2/ In md_open we check if the mddev we expect to open matches the gendisk that we did open. If there is a mismatch we return -ERESTARTSYS and modify __blkdev_get to retry from the top in that case. In the -ERESTARTSYS case we make sure to wait until the old gendisk (that we succeeded in opening) is really gone so we loop at most once. Some udev configurations will always open an md device when it first appears. 
If we allow an md device that was just created by an open to disappear on an immediate close, then this can race with such udev configurations and result in an infinite loop of the device being opened and closed, then re-opened due to the 'ADD' event from the first open, and then closed, and so on. So we make sure an md device, once created by an open, remains active at least until some md 'ioctl' has been made on it. This means that all normal usage of md devices will allow them to disappear promptly when not needed, but the worst that an incorrect usage will do is cause an inactive md device to be left in existence (it can easily be removed). As an array can be stopped by writing to a sysfs attribute echo clear > /sys/block/mdXXX/md/array_state we need to use scheduled work for deleting the gendisk and other kobjects. This allows us to wait for any pending gendisk deletion to complete by simply calling flush_scheduled_work(). Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/md.c61
1 files changed, 49 insertions, 12 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 970a8c42ba92..38697283aaf4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -214,16 +214,33 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
214 return mddev; 214 return mddev;
215} 215}
216 216
217static void mddev_delayed_delete(struct work_struct *ws)
218{
219 mddev_t *mddev = container_of(ws, mddev_t, del_work);
220 kobject_del(&mddev->kobj);
221 kobject_put(&mddev->kobj);
222}
223
217static void mddev_put(mddev_t *mddev) 224static void mddev_put(mddev_t *mddev)
218{ 225{
219 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 226 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
220 return; 227 return;
221 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 228 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
229 !mddev->hold_active) {
222 list_del(&mddev->all_mddevs); 230 list_del(&mddev->all_mddevs);
223 spin_unlock(&all_mddevs_lock); 231 if (mddev->gendisk) {
224 kobject_put(&mddev->kobj); 232 /* we did a probe so need to clean up.
225 } else 233 * Call schedule_work inside the spinlock
226 spin_unlock(&all_mddevs_lock); 234 * so that flush_scheduled_work() after
235 * mddev_find will succeed in waiting for the
236 * work to be done.
237 */
238 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
239 schedule_work(&mddev->del_work);
240 } else
241 kfree(mddev);
242 }
243 spin_unlock(&all_mddevs_lock);
227} 244}
228 245
229static mddev_t * mddev_find(dev_t unit) 246static mddev_t * mddev_find(dev_t unit)
@@ -242,6 +259,7 @@ static mddev_t * mddev_find(dev_t unit)
242 259
243 if (new) { 260 if (new) {
244 list_add(&new->all_mddevs, &all_mddevs); 261 list_add(&new->all_mddevs, &all_mddevs);
262 mddev->hold_active = UNTIL_IOCTL;
245 spin_unlock(&all_mddevs_lock); 263 spin_unlock(&all_mddevs_lock);
246 return new; 264 return new;
247 } 265 }
@@ -3435,6 +3453,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
3435 if (!capable(CAP_SYS_ADMIN)) 3453 if (!capable(CAP_SYS_ADMIN))
3436 return -EACCES; 3454 return -EACCES;
3437 rv = mddev_lock(mddev); 3455 rv = mddev_lock(mddev);
3456 if (mddev->hold_active == UNTIL_IOCTL)
3457 mddev->hold_active = 0;
3438 if (!rv) { 3458 if (!rv) {
3439 rv = entry->store(mddev, page, length); 3459 rv = entry->store(mddev, page, length);
3440 mddev_unlock(mddev); 3460 mddev_unlock(mddev);
@@ -3484,6 +3504,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3484 if (!mddev) 3504 if (!mddev)
3485 return NULL; 3505 return NULL;
3486 3506
 3507 /* wait for any previous instance of this device
 3508 * to be completely removed (mddev_delayed_delete).
3509 */
3510 flush_scheduled_work();
3511
3487 mutex_lock(&disks_mutex); 3512 mutex_lock(&disks_mutex);
3488 if (mddev->gendisk) { 3513 if (mddev->gendisk) {
3489 mutex_unlock(&disks_mutex); 3514 mutex_unlock(&disks_mutex);
@@ -3520,7 +3545,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3520 disk->private_data = mddev; 3545 disk->private_data = mddev;
3521 disk->queue = mddev->queue; 3546 disk->queue = mddev->queue;
3522 /* Allow extended partitions. This makes the 3547 /* Allow extended partitions. This makes the
3523 * 'mdp' device redundant, but we can really 3548 * 'mdp' device redundant, but we can't really
3524 * remove it now. 3549 * remove it now.
3525 */ 3550 */
3526 disk->flags |= GENHD_FL_EXT_DEVT; 3551 disk->flags |= GENHD_FL_EXT_DEVT;
@@ -3536,6 +3561,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3536 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3561 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3537 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3562 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3538 } 3563 }
3564 mddev_put(mddev);
3539 return NULL; 3565 return NULL;
3540} 3566}
3541 3567
@@ -5054,6 +5080,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5054 5080
5055done_unlock: 5081done_unlock:
5056abort_unlock: 5082abort_unlock:
5083 if (mddev->hold_active == UNTIL_IOCTL &&
5084 err != -EINVAL)
5085 mddev->hold_active = 0;
5057 mddev_unlock(mddev); 5086 mddev_unlock(mddev);
5058 5087
5059 return err; 5088 return err;
@@ -5070,14 +5099,25 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5070 * Succeed if we can lock the mddev, which confirms that 5099 * Succeed if we can lock the mddev, which confirms that
5071 * it isn't being stopped right now. 5100 * it isn't being stopped right now.
5072 */ 5101 */
5073 mddev_t *mddev = bdev->bd_disk->private_data; 5102 mddev_t *mddev = mddev_find(bdev->bd_dev);
5074 int err; 5103 int err;
5075 5104
5105 if (mddev->gendisk != bdev->bd_disk) {
5106 /* we are racing with mddev_put which is discarding this
5107 * bd_disk.
5108 */
5109 mddev_put(mddev);
5110 /* Wait until bdev->bd_disk is definitely gone */
5111 flush_scheduled_work();
5112 /* Then retry the open from the top */
5113 return -ERESTARTSYS;
5114 }
5115 BUG_ON(mddev != bdev->bd_disk->private_data);
5116
5076 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5117 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
5077 goto out; 5118 goto out;
5078 5119
5079 err = 0; 5120 err = 0;
5080 mddev_get(mddev);
5081 atomic_inc(&mddev->openers); 5121 atomic_inc(&mddev->openers);
5082 mddev_unlock(mddev); 5122 mddev_unlock(mddev);
5083 5123
@@ -6436,11 +6476,8 @@ static __exit void md_exit(void)
6436 unregister_sysctl_table(raid_table_header); 6476 unregister_sysctl_table(raid_table_header);
6437 remove_proc_entry("mdstat", NULL); 6477 remove_proc_entry("mdstat", NULL);
6438 for_each_mddev(mddev, tmp) { 6478 for_each_mddev(mddev, tmp) {
6439 struct gendisk *disk = mddev->gendisk;
6440 if (!disk)
6441 continue;
6442 export_array(mddev); 6479 export_array(mddev);
6443 mddev_put(mddev); 6480 mddev->hold_active = 0;
6444 } 6481 }
6445} 6482}
6446 6483