diff options
author | Dan Williams <dan.j.williams@intel.com> | 2008-06-28 00:44:04 -0400 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2008-06-30 20:18:19 -0400 |
commit | b5470dc5fc18a8ff6517c3bb538d1479e58ecb02 (patch) | |
tree | 37b0eb3a4691bdbe58dc5c6c73b2dc8d3925b332 | |
parent | 1fe797e67fb07d605b82300934d0de67068a0aca (diff) |
md: resolve external metadata handling deadlock in md_allow_write
md_allow_write() marks the metadata dirty while holding mddev->lock and then
waits for the write to complete. For externally managed metadata this causes a
deadlock as userspace needs to take the lock to communicate that the metadata
update has completed.
Change md_allow_write() in the 'external' case to start the 'mark active'
operation and then return -EAGAIN. The expected side effects while waiting for
userspace to write 'active' to 'array_state' are that reshape is held off (code
currently handles -ENOMEM), some 'stripe_cache_size' change requests fail,
some GET_BITMAP_FILE ioctl requests fall back to GFP_NOIO, and updates to
'raid_disks' fail. Except for 'stripe_cache_size' changes,
these failures can be mitigated by coordinating with mdmon.
md_write_start() still prevents writes from occurring until the metadata
handler has had a chance to take action, because md_write_start()
unconditionally waits for MD_CHANGE_CLEAN to be cleared.
[neilb@suse.de: return -EAGAIN, try GFP_NOIO]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r-- | drivers/md/md.c | 27 | ||||
-rw-r--r-- | drivers/md/raid1.c | 6 | ||||
-rw-r--r-- | drivers/md/raid5.c | 12 | ||||
-rw-r--r-- | include/linux/raid/md.h | 2 |
4 files changed, 30 insertions, 17 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index df1230af02cd..43d033d9a05a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) | |||
4172 | char *ptr, *buf = NULL; | 4172 | char *ptr, *buf = NULL; |
4173 | int err = -ENOMEM; | 4173 | int err = -ENOMEM; |
4174 | 4174 | ||
4175 | md_allow_write(mddev); | 4175 | if (md_allow_write(mddev)) |
4176 | file = kmalloc(sizeof(*file), GFP_NOIO); | ||
4177 | else | ||
4178 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
4176 | 4179 | ||
4177 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
4178 | if (!file) | 4180 | if (!file) |
4179 | goto out; | 4181 | goto out; |
4180 | 4182 | ||
@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev) | |||
5667 | * may proceed without blocking. It is important to call this before | 5669 | * may proceed without blocking. It is important to call this before |
5668 | * attempting a GFP_KERNEL allocation while holding the mddev lock. | 5670 | * attempting a GFP_KERNEL allocation while holding the mddev lock. |
5669 | * Must be called with mddev_lock held. | 5671 | * Must be called with mddev_lock held. |
5672 | * | ||
5673 | * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock | ||
5674 | * is dropped, so return -EAGAIN after notifying userspace. | ||
5670 | */ | 5675 | */ |
5671 | void md_allow_write(mddev_t *mddev) | 5676 | int md_allow_write(mddev_t *mddev) |
5672 | { | 5677 | { |
5673 | if (!mddev->pers) | 5678 | if (!mddev->pers) |
5674 | return; | 5679 | return 0; |
5675 | if (mddev->ro) | 5680 | if (mddev->ro) |
5676 | return; | 5681 | return 0; |
5677 | if (!mddev->pers->sync_request) | 5682 | if (!mddev->pers->sync_request) |
5678 | return; | 5683 | return 0; |
5679 | 5684 | ||
5680 | spin_lock_irq(&mddev->write_lock); | 5685 | spin_lock_irq(&mddev->write_lock); |
5681 | if (mddev->in_sync) { | 5686 | if (mddev->in_sync) { |
@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev) | |||
5686 | mddev->safemode = 1; | 5691 | mddev->safemode = 1; |
5687 | spin_unlock_irq(&mddev->write_lock); | 5692 | spin_unlock_irq(&mddev->write_lock); |
5688 | md_update_sb(mddev, 0); | 5693 | md_update_sb(mddev, 0); |
5689 | |||
5690 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | 5694 | sysfs_notify(&mddev->kobj, NULL, "array_state"); |
5691 | /* wait for the dirty state to be recorded in the metadata */ | ||
5692 | wait_event(mddev->sb_wait, | ||
5693 | !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && | ||
5694 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
5695 | } else | 5695 | } else |
5696 | spin_unlock_irq(&mddev->write_lock); | 5696 | spin_unlock_irq(&mddev->write_lock); |
5697 | |||
5698 | if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) | ||
5699 | return -EAGAIN; | ||
5700 | else | ||
5701 | return 0; | ||
5697 | } | 5702 | } |
5698 | EXPORT_SYMBOL_GPL(md_allow_write); | 5703 | EXPORT_SYMBOL_GPL(md_allow_write); |
5699 | 5704 | ||
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f05d5983efb6..491dc2d4ad5f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev) | |||
2136 | conf_t *conf = mddev_to_conf(mddev); | 2136 | conf_t *conf = mddev_to_conf(mddev); |
2137 | int cnt, raid_disks; | 2137 | int cnt, raid_disks; |
2138 | unsigned long flags; | 2138 | unsigned long flags; |
2139 | int d, d2; | 2139 | int d, d2, err; |
2140 | 2140 | ||
2141 | /* Cannot change chunk_size, layout, or level */ | 2141 | /* Cannot change chunk_size, layout, or level */ |
2142 | if (mddev->chunk_size != mddev->new_chunk || | 2142 | if (mddev->chunk_size != mddev->new_chunk || |
@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev) | |||
2148 | return -EINVAL; | 2148 | return -EINVAL; |
2149 | } | 2149 | } |
2150 | 2150 | ||
2151 | md_allow_write(mddev); | 2151 | err = md_allow_write(mddev); |
2152 | if (err) | ||
2153 | return err; | ||
2152 | 2154 | ||
2153 | raid_disks = mddev->raid_disks + mddev->delta_disks; | 2155 | raid_disks = mddev->raid_disks + mddev->delta_disks; |
2154 | 2156 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 442622067cae..8f4c70a53210 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
911 | struct stripe_head *osh, *nsh; | 911 | struct stripe_head *osh, *nsh; |
912 | LIST_HEAD(newstripes); | 912 | LIST_HEAD(newstripes); |
913 | struct disk_info *ndisks; | 913 | struct disk_info *ndisks; |
914 | int err = 0; | 914 | int err; |
915 | struct kmem_cache *sc; | 915 | struct kmem_cache *sc; |
916 | int i; | 916 | int i; |
917 | 917 | ||
918 | if (newsize <= conf->pool_size) | 918 | if (newsize <= conf->pool_size) |
919 | return 0; /* never bother to shrink */ | 919 | return 0; /* never bother to shrink */ |
920 | 920 | ||
921 | md_allow_write(conf->mddev); | 921 | err = md_allow_write(conf->mddev); |
922 | if (err) | ||
923 | return err; | ||
922 | 924 | ||
923 | /* Step 1 */ | 925 | /* Step 1 */ |
924 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], | 926 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], |
@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
3843 | { | 3845 | { |
3844 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3846 | raid5_conf_t *conf = mddev_to_conf(mddev); |
3845 | unsigned long new; | 3847 | unsigned long new; |
3848 | int err; | ||
3849 | |||
3846 | if (len >= PAGE_SIZE) | 3850 | if (len >= PAGE_SIZE) |
3847 | return -EINVAL; | 3851 | return -EINVAL; |
3848 | if (!conf) | 3852 | if (!conf) |
@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
3858 | else | 3862 | else |
3859 | break; | 3863 | break; |
3860 | } | 3864 | } |
3861 | md_allow_write(mddev); | 3865 | err = md_allow_write(mddev); |
3866 | if (err) | ||
3867 | return err; | ||
3862 | while (new > conf->max_nr_stripes) { | 3868 | while (new > conf->max_nr_stripes) { |
3863 | if (grow_one_stripe(conf)) | 3869 | if (grow_one_stripe(conf)) |
3864 | conf->max_nr_stripes++; | 3870 | conf->max_nr_stripes++; |
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b7386ae9d288..dc0e3fcb9f28 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h | |||
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | |||
95 | struct page *page, int rw); | 95 | struct page *page, int rw); |
96 | extern void md_do_sync(mddev_t *mddev); | 96 | extern void md_do_sync(mddev_t *mddev); |
97 | extern void md_new_event(mddev_t *mddev); | 97 | extern void md_new_event(mddev_t *mddev); |
98 | extern void md_allow_write(mddev_t *mddev); | 98 | extern int md_allow_write(mddev_t *mddev); |
99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
100 | 100 | ||
101 | #endif /* CONFIG_MD */ | 101 | #endif /* CONFIG_MD */ |