aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Dan Williams <dan.j.williams@intel.com> 2008-06-28 00:44:04 -0400
committer: Dan Williams <dan.j.williams@intel.com> 2008-06-30 20:18:19 -0400
commit: b5470dc5fc18a8ff6517c3bb538d1479e58ecb02 (patch)
tree: 37b0eb3a4691bdbe58dc5c6c73b2dc8d3925b332
parent: 1fe797e67fb07d605b82300934d0de67068a0aca (diff)
md: resolve external metadata handling deadlock in md_allow_write
md_allow_write() marks the metadata dirty while holding mddev->lock and then waits for the write to complete. For externally managed metadata this causes a deadlock, because userspace needs to take the lock to communicate that the metadata update has completed. Change md_allow_write() in the 'external' case to start the 'mark active' operation and then return -EAGAIN. The expected side effects while waiting for userspace to write 'active' to 'array_state' are: reshape is held off (the code currently handles -ENOMEM), some 'stripe_cache_size' change requests fail, some GET_BITMAP_FILE ioctl requests fall back to GFP_NOIO, and updates to 'raid_disks' fail. Except for 'stripe_cache_size' changes, these failures can be mitigated by coordinating with mdmon. md_write_start() still prevents writes from occurring until the metadata handler has had a chance to take action, as it unconditionally waits for MD_CHANGE_CLEAN to be cleared. [neilb@suse.de: return -EAGAIN, try GFP_NOIO] Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r-- drivers/md/md.c 27
-rw-r--r-- drivers/md/raid1.c 6
-rw-r--r-- drivers/md/raid5.c 12
-rw-r--r-- include/linux/raid/md.h 2
4 files changed, 30 insertions, 17 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index df1230af02cd..43d033d9a05a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4172 char *ptr, *buf = NULL; 4172 char *ptr, *buf = NULL;
4173 int err = -ENOMEM; 4173 int err = -ENOMEM;
4174 4174
4175 md_allow_write(mddev); 4175 if (md_allow_write(mddev))
4176 file = kmalloc(sizeof(*file), GFP_NOIO);
4177 else
4178 file = kmalloc(sizeof(*file), GFP_KERNEL);
4176 4179
4177 file = kmalloc(sizeof(*file), GFP_KERNEL);
4178 if (!file) 4180 if (!file)
4179 goto out; 4181 goto out;
4180 4182
@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
5667 * may proceed without blocking. It is important to call this before 5669 * may proceed without blocking. It is important to call this before
5668 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5670 * attempting a GFP_KERNEL allocation while holding the mddev lock.
5669 * Must be called with mddev_lock held. 5671 * Must be called with mddev_lock held.
5672 *
5673 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
5674 * is dropped, so return -EAGAIN after notifying userspace.
5670 */ 5675 */
5671void md_allow_write(mddev_t *mddev) 5676int md_allow_write(mddev_t *mddev)
5672{ 5677{
5673 if (!mddev->pers) 5678 if (!mddev->pers)
5674 return; 5679 return 0;
5675 if (mddev->ro) 5680 if (mddev->ro)
5676 return; 5681 return 0;
5677 if (!mddev->pers->sync_request) 5682 if (!mddev->pers->sync_request)
5678 return; 5683 return 0;
5679 5684
5680 spin_lock_irq(&mddev->write_lock); 5685 spin_lock_irq(&mddev->write_lock);
5681 if (mddev->in_sync) { 5686 if (mddev->in_sync) {
@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
5686 mddev->safemode = 1; 5691 mddev->safemode = 1;
5687 spin_unlock_irq(&mddev->write_lock); 5692 spin_unlock_irq(&mddev->write_lock);
5688 md_update_sb(mddev, 0); 5693 md_update_sb(mddev, 0);
5689
5690 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5694 sysfs_notify(&mddev->kobj, NULL, "array_state");
5691 /* wait for the dirty state to be recorded in the metadata */
5692 wait_event(mddev->sb_wait,
5693 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5694 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
5695 } else 5695 } else
5696 spin_unlock_irq(&mddev->write_lock); 5696 spin_unlock_irq(&mddev->write_lock);
5697
5698 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
5699 return -EAGAIN;
5700 else
5701 return 0;
5697} 5702}
5698EXPORT_SYMBOL_GPL(md_allow_write); 5703EXPORT_SYMBOL_GPL(md_allow_write);
5699 5704
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f05d5983efb6..491dc2d4ad5f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
2136 conf_t *conf = mddev_to_conf(mddev); 2136 conf_t *conf = mddev_to_conf(mddev);
2137 int cnt, raid_disks; 2137 int cnt, raid_disks;
2138 unsigned long flags; 2138 unsigned long flags;
2139 int d, d2; 2139 int d, d2, err;
2140 2140
2141 /* Cannot change chunk_size, layout, or level */ 2141 /* Cannot change chunk_size, layout, or level */
2142 if (mddev->chunk_size != mddev->new_chunk || 2142 if (mddev->chunk_size != mddev->new_chunk ||
@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
2148 return -EINVAL; 2148 return -EINVAL;
2149 } 2149 }
2150 2150
2151 md_allow_write(mddev); 2151 err = md_allow_write(mddev);
2152 if (err)
2153 return err;
2152 2154
2153 raid_disks = mddev->raid_disks + mddev->delta_disks; 2155 raid_disks = mddev->raid_disks + mddev->delta_disks;
2154 2156
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 442622067cae..8f4c70a53210 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
911 struct stripe_head *osh, *nsh; 911 struct stripe_head *osh, *nsh;
912 LIST_HEAD(newstripes); 912 LIST_HEAD(newstripes);
913 struct disk_info *ndisks; 913 struct disk_info *ndisks;
914 int err = 0; 914 int err;
915 struct kmem_cache *sc; 915 struct kmem_cache *sc;
916 int i; 916 int i;
917 917
918 if (newsize <= conf->pool_size) 918 if (newsize <= conf->pool_size)
919 return 0; /* never bother to shrink */ 919 return 0; /* never bother to shrink */
920 920
921 md_allow_write(conf->mddev); 921 err = md_allow_write(conf->mddev);
922 if (err)
923 return err;
922 924
923 /* Step 1 */ 925 /* Step 1 */
924 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 926 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
3843{ 3845{
3844 raid5_conf_t *conf = mddev_to_conf(mddev); 3846 raid5_conf_t *conf = mddev_to_conf(mddev);
3845 unsigned long new; 3847 unsigned long new;
3848 int err;
3849
3846 if (len >= PAGE_SIZE) 3850 if (len >= PAGE_SIZE)
3847 return -EINVAL; 3851 return -EINVAL;
3848 if (!conf) 3852 if (!conf)
@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
3858 else 3862 else
3859 break; 3863 break;
3860 } 3864 }
3861 md_allow_write(mddev); 3865 err = md_allow_write(mddev);
3866 if (err)
3867 return err;
3862 while (new > conf->max_nr_stripes) { 3868 while (new > conf->max_nr_stripes) {
3863 if (grow_one_stripe(conf)) 3869 if (grow_one_stripe(conf))
3864 conf->max_nr_stripes++; 3870 conf->max_nr_stripes++;
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index b7386ae9d288..dc0e3fcb9f28 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
95 struct page *page, int rw); 95 struct page *page, int rw);
96extern void md_do_sync(mddev_t *mddev); 96extern void md_do_sync(mddev_t *mddev);
97extern void md_new_event(mddev_t *mddev); 97extern void md_new_event(mddev_t *mddev);
98extern void md_allow_write(mddev_t *mddev); 98extern int md_allow_write(mddev_t *mddev);
99extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 99extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
100 100
101#endif /* CONFIG_MD */ 101#endif /* CONFIG_MD */