author	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 14:05:49 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 14:05:49 -0500
commit	5d8e7fb6916556e9b476de33404e8c9e2c9aee61 (patch)
tree	2f2e1c0f0df579a221e3bc99e5ccf5ddacfcc27a /drivers/md
parent	87c9172f71e3f729729aad27fa6592bb795137fd (diff)
parent	53a6ab4d3f6d6dc87ec8f14998b4b5536ee2968c (diff)
Merge tag 'md/3.20' of git://neil.brown.name/md
Pull md updates from Neil Brown:

 - assorted locking changes so that access to /proc/mdstat and much of
   /sys/block/mdXX/md/* is protected by a spinlock rather than a mutex
   and will never block indefinitely.

 - Make an 'if' condition in RAID5 - which has been implicated in
   recent bugs - more readable.

 - misc minor fixes

* tag 'md/3.20' of git://neil.brown.name/md: (28 commits)
  md/raid10: fix conversion from RAID0 to RAID10
  md: wakeup thread upon rdev_dec_pending()
  md: make reconfig_mutex optional for writes to md sysfs files.
  md: move mddev_lock and related to md.h
  md: use mddev->lock to protect updates to resync_{min,max}.
  md: minor cleanup in safe_delay_store.
  md: move GET_BITMAP_FILE ioctl out from mddev_lock.
  md: tidy up set_bitmap_file
  md: remove unnecessary 'buf' from get_bitmap_file.
  md: remove mddev_lock from rdev_attr_show()
  md: remove mddev_lock() from md_attr_show()
  md/raid5: use ->lock to protect accessing raid5 sysfs attributes.
  md: remove need for mddev_lock() in md_seq_show()
  md/bitmap: protect clearing of ->bitmap by mddev->lock
  md: protect ->pers changes with mddev->lock
  md: level_store: group all important changes into one place.
  md: rename ->stop to ->free
  md: split detach operation out from ->stop.
  md/linear: remove rcu protections in favour of suspend/resume
  md: make merge_bvec_fn more robust in face of personality changes.
  ...
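
Among the commits listed above, "md: rename ->stop to ->free" and "md: split detach operation out from ->stop" change the personality API that the faulty.c and linear.c hunks below are converted to. As a minimal sketch of the new shape, assuming a hypothetical "example" personality (the example_* identifiers and the example_conf structure are illustrative, not part of this merge):

#include <linux/slab.h>
#include <linux/module.h>
#include "md.h"

/* Private data that this personality's ->run() would have allocated. */
struct example_conf {
	int nr_disks;
};

/*
 * New-style callback: the md core passes the private data in and does the
 * detach itself; the personality only frees what it allocated.  Compare
 * faulty_free() and linear_free() in the hunks below.
 */
static void example_free(struct mddev *mddev, void *priv)
{
	struct example_conf *conf = priv;

	kfree(conf);	/* no "mddev->private = NULL" and no return value any more */
}

static struct md_personality example_personality = {
	.name	= "example",
	.owner	= THIS_MODULE,
	.free	= example_free,		/* previously: .stop = example_stop */
};

Teardown that ->stop() used to do beyond freeing its own data is now handled by the core; the md.c hunks below add a forward declaration for the new mddev_detach() helper and call it from level_store() before the personalities are swapped.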
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c     |  15
-rw-r--r--  drivers/md/dm-raid.c    |   8
-rw-r--r--  drivers/md/faulty.c     |   8
-rw-r--r--  drivers/md/linear.c     |  67
-rw-r--r--  drivers/md/md.c         | 816
-rw-r--r--  drivers/md/md.h         |  57
-rw-r--r--  drivers/md/multipath.c  |  22
-rw-r--r--  drivers/md/raid0.c      |  29
-rw-r--r--  drivers/md/raid1.c      |  52
-rw-r--r--  drivers/md/raid1.h      |   3
-rw-r--r--  drivers/md/raid10.c     |  49
-rw-r--r--  drivers/md/raid10.h     |   3
-rw-r--r--  drivers/md/raid5.c      | 334
-rw-r--r--  drivers/md/raid5.h      |   1
14 files changed, 858 insertions, 606 deletions
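
Most of the locking commits above follow one pattern, visible first in the bitmap.c hunks: sysfs ->show handlers stop taking the sleeping reconfig_mutex and instead snapshot state under mddev->lock, the spinlock this series renames from write_lock. A minimal sketch of that pattern with a hypothetical attribute (behind_writes_used_show in bitmap.c below is the in-tree handler this mirrors):

#include <linux/kernel.h>
#include "md.h"
#include "bitmap.h"

/*
 * Hypothetical show handler: a short spin_lock(&mddev->lock) section
 * instead of mddev_lock(), so reads of /sys/block/mdXX/md/* never block
 * indefinitely on the reconfig_mutex.
 */
static ssize_t example_show(struct mddev *mddev, char *page)
{
	ssize_t len;

	spin_lock(&mddev->lock);
	if (mddev->bitmap)
		len = sprintf(page, "%lu\n",
			      mddev->bitmap->behind_writes_used);
	else
		len = sprintf(page, "0\n");
	spin_unlock(&mddev->lock);

	return len;
}

Handlers that genuinely reconfigure the array keep the mutex, but now take it themselves via mddev_lock() inside the individual ->store functions, as the md.c hunks below show.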
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1695ee5f3ffc..3a5767968ba0 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1619,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev)
1619 return; 1619 return;
1620 1620
1621 mutex_lock(&mddev->bitmap_info.mutex); 1621 mutex_lock(&mddev->bitmap_info.mutex);
1622 spin_lock(&mddev->lock);
1622 mddev->bitmap = NULL; /* disconnect from the md device */ 1623 mddev->bitmap = NULL; /* disconnect from the md device */
1624 spin_unlock(&mddev->lock);
1623 mutex_unlock(&mddev->bitmap_info.mutex); 1625 mutex_unlock(&mddev->bitmap_info.mutex);
1624 if (mddev->thread) 1626 if (mddev->thread)
1625 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1627 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
@@ -2209,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
2209static ssize_t can_clear_show(struct mddev *mddev, char *page) 2211static ssize_t can_clear_show(struct mddev *mddev, char *page)
2210{ 2212{
2211 int len; 2213 int len;
2214 spin_lock(&mddev->lock);
2212 if (mddev->bitmap) 2215 if (mddev->bitmap)
2213 len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? 2216 len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
2214 "false" : "true")); 2217 "false" : "true"));
2215 else 2218 else
2216 len = sprintf(page, "\n"); 2219 len = sprintf(page, "\n");
2220 spin_unlock(&mddev->lock);
2217 return len; 2221 return len;
2218} 2222}
2219 2223
@@ -2238,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2238static ssize_t 2242static ssize_t
2239behind_writes_used_show(struct mddev *mddev, char *page) 2243behind_writes_used_show(struct mddev *mddev, char *page)
2240{ 2244{
2245 ssize_t ret;
2246 spin_lock(&mddev->lock);
2241 if (mddev->bitmap == NULL) 2247 if (mddev->bitmap == NULL)
2242 return sprintf(page, "0\n"); 2248 ret = sprintf(page, "0\n");
2243 return sprintf(page, "%lu\n", 2249 else
2244 mddev->bitmap->behind_writes_used); 2250 ret = sprintf(page, "%lu\n",
2251 mddev->bitmap->behind_writes_used);
2252 spin_unlock(&mddev->lock);
2253 return ret;
2245} 2254}
2246 2255
2247static ssize_t 2256static ssize_t
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 07c0fa0fa284..777d9ba2acad 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
746{ 746{
747 struct raid_set *rs = container_of(cb, struct raid_set, callbacks); 747 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
748 748
749 if (rs->raid_type->level == 1) 749 return mddev_congested(&rs->md, bits);
750 return md_raid1_congested(&rs->md, bits);
751
752 if (rs->raid_type->level == 10)
753 return md_raid10_congested(&rs->md, bits);
754
755 return md_raid5_congested(&rs->md, bits);
756} 750}
757 751
758/* 752/*
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index e8b4574956c7..1277eb26b58a 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -332,13 +332,11 @@ static int run(struct mddev *mddev)
332 return 0; 332 return 0;
333} 333}
334 334
335static int stop(struct mddev *mddev) 335static void faulty_free(struct mddev *mddev, void *priv)
336{ 336{
337 struct faulty_conf *conf = mddev->private; 337 struct faulty_conf *conf = priv;
338 338
339 kfree(conf); 339 kfree(conf);
340 mddev->private = NULL;
341 return 0;
342} 340}
343 341
344static struct md_personality faulty_personality = 342static struct md_personality faulty_personality =
@@ -348,7 +346,7 @@ static struct md_personality faulty_personality =
348 .owner = THIS_MODULE, 346 .owner = THIS_MODULE,
349 .make_request = make_request, 347 .make_request = make_request,
350 .run = run, 348 .run = run,
351 .stop = stop, 349 .free = faulty_free,
352 .status = status, 350 .status = status,
353 .check_reshape = reshape, 351 .check_reshape = reshape,
354 .size = faulty_size, 352 .size = faulty_size,
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 64713b77df1c..fa7d577f3d12 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
34 34
35 lo = 0; 35 lo = 0;
36 hi = mddev->raid_disks - 1; 36 hi = mddev->raid_disks - 1;
37 conf = rcu_dereference(mddev->private); 37 conf = mddev->private;
38 38
39 /* 39 /*
40 * Binary Search 40 * Binary Search
@@ -60,18 +60,16 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
60 * 60 *
61 * Return amount of bytes we can take at this offset 61 * Return amount of bytes we can take at this offset
62 */ 62 */
63static int linear_mergeable_bvec(struct request_queue *q, 63static int linear_mergeable_bvec(struct mddev *mddev,
64 struct bvec_merge_data *bvm, 64 struct bvec_merge_data *bvm,
65 struct bio_vec *biovec) 65 struct bio_vec *biovec)
66{ 66{
67 struct mddev *mddev = q->queuedata;
68 struct dev_info *dev0; 67 struct dev_info *dev0;
69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; 68 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 69 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
71 int maxbytes = biovec->bv_len; 70 int maxbytes = biovec->bv_len;
72 struct request_queue *subq; 71 struct request_queue *subq;
73 72
74 rcu_read_lock();
75 dev0 = which_dev(mddev, sector); 73 dev0 = which_dev(mddev, sector);
76 maxsectors = dev0->end_sector - sector; 74 maxsectors = dev0->end_sector - sector;
77 subq = bdev_get_queue(dev0->rdev->bdev); 75 subq = bdev_get_queue(dev0->rdev->bdev);
@@ -81,7 +79,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
81 maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, 79 maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
82 biovec)); 80 biovec));
83 } 81 }
84 rcu_read_unlock();
85 82
86 if (maxsectors < bio_sectors) 83 if (maxsectors < bio_sectors)
87 maxsectors = 0; 84 maxsectors = 0;
@@ -97,24 +94,18 @@ static int linear_mergeable_bvec(struct request_queue *q,
97 return maxsectors << 9; 94 return maxsectors << 9;
98} 95}
99 96
100static int linear_congested(void *data, int bits) 97static int linear_congested(struct mddev *mddev, int bits)
101{ 98{
102 struct mddev *mddev = data;
103 struct linear_conf *conf; 99 struct linear_conf *conf;
104 int i, ret = 0; 100 int i, ret = 0;
105 101
106 if (mddev_congested(mddev, bits)) 102 conf = mddev->private;
107 return 1;
108
109 rcu_read_lock();
110 conf = rcu_dereference(mddev->private);
111 103
112 for (i = 0; i < mddev->raid_disks && !ret ; i++) { 104 for (i = 0; i < mddev->raid_disks && !ret ; i++) {
113 struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); 105 struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
114 ret |= bdi_congested(&q->backing_dev_info, bits); 106 ret |= bdi_congested(&q->backing_dev_info, bits);
115 } 107 }
116 108
117 rcu_read_unlock();
118 return ret; 109 return ret;
119} 110}
120 111
@@ -123,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk
123 struct linear_conf *conf; 114 struct linear_conf *conf;
124 sector_t array_sectors; 115 sector_t array_sectors;
125 116
126 rcu_read_lock(); 117 conf = mddev->private;
127 conf = rcu_dereference(mddev->private);
128 WARN_ONCE(sectors || raid_disks, 118 WARN_ONCE(sectors || raid_disks,
129 "%s does not support generic reshape\n", __func__); 119 "%s does not support generic reshape\n", __func__);
130 array_sectors = conf->array_sectors; 120 array_sectors = conf->array_sectors;
131 rcu_read_unlock();
132 121
133 return array_sectors; 122 return array_sectors;
134} 123}
@@ -217,10 +206,6 @@ static int linear_run (struct mddev *mddev)
217 mddev->private = conf; 206 mddev->private = conf;
218 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 207 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
219 208
220 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
221 mddev->queue->backing_dev_info.congested_fn = linear_congested;
222 mddev->queue->backing_dev_info.congested_data = mddev;
223
224 ret = md_integrity_register(mddev); 209 ret = md_integrity_register(mddev);
225 if (ret) { 210 if (ret) {
226 kfree(conf); 211 kfree(conf);
@@ -252,38 +237,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
252 if (!newconf) 237 if (!newconf)
253 return -ENOMEM; 238 return -ENOMEM;
254 239
255 oldconf = rcu_dereference_protected(mddev->private, 240 mddev_suspend(mddev);
256 lockdep_is_held( 241 oldconf = mddev->private;
257 &mddev->reconfig_mutex));
258 mddev->raid_disks++; 242 mddev->raid_disks++;
259 rcu_assign_pointer(mddev->private, newconf); 243 mddev->private = newconf;
260 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 244 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
261 set_capacity(mddev->gendisk, mddev->array_sectors); 245 set_capacity(mddev->gendisk, mddev->array_sectors);
246 mddev_resume(mddev);
262 revalidate_disk(mddev->gendisk); 247 revalidate_disk(mddev->gendisk);
263 kfree_rcu(oldconf, rcu); 248 kfree(oldconf);
264 return 0; 249 return 0;
265} 250}
266 251
267static int linear_stop (struct mddev *mddev) 252static void linear_free(struct mddev *mddev, void *priv)
268{ 253{
269 struct linear_conf *conf = 254 struct linear_conf *conf = priv;
270 rcu_dereference_protected(mddev->private,
271 lockdep_is_held(
272 &mddev->reconfig_mutex));
273 255
274 /*
275 * We do not require rcu protection here since
276 * we hold reconfig_mutex for both linear_add and
277 * linear_stop, so they cannot race.
278 * We should make sure any old 'conf's are properly
279 * freed though.
280 */
281 rcu_barrier();
282 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
283 kfree(conf); 256 kfree(conf);
284 mddev->private = NULL;
285
286 return 0;
287} 257}
288 258
289static void linear_make_request(struct mddev *mddev, struct bio *bio) 259static void linear_make_request(struct mddev *mddev, struct bio *bio)
@@ -299,16 +269,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
299 } 269 }
300 270
301 do { 271 do {
302 rcu_read_lock();
303
304 tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); 272 tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
305 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; 273 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
306 end_sector = tmp_dev->end_sector; 274 end_sector = tmp_dev->end_sector;
307 data_offset = tmp_dev->rdev->data_offset; 275 data_offset = tmp_dev->rdev->data_offset;
308 bio->bi_bdev = tmp_dev->rdev->bdev; 276 bio->bi_bdev = tmp_dev->rdev->bdev;
309 277
310 rcu_read_unlock();
311
312 if (unlikely(bio->bi_iter.bi_sector >= end_sector || 278 if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
313 bio->bi_iter.bi_sector < start_sector)) 279 bio->bi_iter.bi_sector < start_sector))
314 goto out_of_bounds; 280 goto out_of_bounds;
@@ -355,6 +321,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev)
355 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); 321 seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
356} 322}
357 323
324static void linear_quiesce(struct mddev *mddev, int state)
325{
326}
327
358static struct md_personality linear_personality = 328static struct md_personality linear_personality =
359{ 329{
360 .name = "linear", 330 .name = "linear",
@@ -362,10 +332,13 @@ static struct md_personality linear_personality =
362 .owner = THIS_MODULE, 332 .owner = THIS_MODULE,
363 .make_request = linear_make_request, 333 .make_request = linear_make_request,
364 .run = linear_run, 334 .run = linear_run,
365 .stop = linear_stop, 335 .free = linear_free,
366 .status = linear_status, 336 .status = linear_status,
367 .hot_add_disk = linear_add, 337 .hot_add_disk = linear_add,
368 .size = linear_size, 338 .size = linear_size,
339 .quiesce = linear_quiesce,
340 .congested = linear_congested,
341 .mergeable_bvec = linear_mergeable_bvec,
369}; 342};
370 343
371static int __init linear_init (void) 344static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 709755fb6d7b..c8d2bac4e28b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq;
72 72
73static int remove_and_add_spares(struct mddev *mddev, 73static int remove_and_add_spares(struct mddev *mddev,
74 struct md_rdev *this); 74 struct md_rdev *this);
75static void mddev_detach(struct mddev *mddev);
75 76
76/* 77/*
77 * Default number of read corrections we'll attempt on an rdev 78 * Default number of read corrections we'll attempt on an rdev
@@ -292,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
292/* mddev_suspend makes sure no new requests are submitted 293/* mddev_suspend makes sure no new requests are submitted
293 * to the device, and that any requests that have been submitted 294 * to the device, and that any requests that have been submitted
294 * are completely handled. 295 * are completely handled.
295 * Once ->stop is called and completes, the module will be completely 296 * Once mddev_detach() is called and completes, the module will be
296 * unused. 297 * completely unused.
297 */ 298 */
298void mddev_suspend(struct mddev *mddev) 299void mddev_suspend(struct mddev *mddev)
299{ 300{
@@ -321,10 +322,47 @@ EXPORT_SYMBOL_GPL(mddev_resume);
321 322
322int mddev_congested(struct mddev *mddev, int bits) 323int mddev_congested(struct mddev *mddev, int bits)
323{ 324{
324 return mddev->suspended; 325 struct md_personality *pers = mddev->pers;
326 int ret = 0;
327
328 rcu_read_lock();
329 if (mddev->suspended)
330 ret = 1;
331 else if (pers && pers->congested)
332 ret = pers->congested(mddev, bits);
333 rcu_read_unlock();
334 return ret;
335}
336EXPORT_SYMBOL_GPL(mddev_congested);
337static int md_congested(void *data, int bits)
338{
339 struct mddev *mddev = data;
340 return mddev_congested(mddev, bits);
325} 341}
326EXPORT_SYMBOL(mddev_congested);
327 342
343static int md_mergeable_bvec(struct request_queue *q,
344 struct bvec_merge_data *bvm,
345 struct bio_vec *biovec)
346{
347 struct mddev *mddev = q->queuedata;
348 int ret;
349 rcu_read_lock();
350 if (mddev->suspended) {
351 /* Must always allow one vec */
352 if (bvm->bi_size == 0)
353 ret = biovec->bv_len;
354 else
355 ret = 0;
356 } else {
357 struct md_personality *pers = mddev->pers;
358 if (pers && pers->mergeable_bvec)
359 ret = pers->mergeable_bvec(mddev, bvm, biovec);
360 else
361 ret = biovec->bv_len;
362 }
363 rcu_read_unlock();
364 return ret;
365}
328/* 366/*
329 * Generic flush handling for md 367 * Generic flush handling for md
330 */ 368 */
@@ -397,12 +435,12 @@ static void md_submit_flush_data(struct work_struct *ws)
397 435
398void md_flush_request(struct mddev *mddev, struct bio *bio) 436void md_flush_request(struct mddev *mddev, struct bio *bio)
399{ 437{
400 spin_lock_irq(&mddev->write_lock); 438 spin_lock_irq(&mddev->lock);
401 wait_event_lock_irq(mddev->sb_wait, 439 wait_event_lock_irq(mddev->sb_wait,
402 !mddev->flush_bio, 440 !mddev->flush_bio,
403 mddev->write_lock); 441 mddev->lock);
404 mddev->flush_bio = bio; 442 mddev->flush_bio = bio;
405 spin_unlock_irq(&mddev->write_lock); 443 spin_unlock_irq(&mddev->lock);
406 444
407 INIT_WORK(&mddev->flush_work, submit_flushes); 445 INIT_WORK(&mddev->flush_work, submit_flushes);
408 queue_work(md_wq, &mddev->flush_work); 446 queue_work(md_wq, &mddev->flush_work);
@@ -465,7 +503,7 @@ void mddev_init(struct mddev *mddev)
465 atomic_set(&mddev->active, 1); 503 atomic_set(&mddev->active, 1);
466 atomic_set(&mddev->openers, 0); 504 atomic_set(&mddev->openers, 0);
467 atomic_set(&mddev->active_io, 0); 505 atomic_set(&mddev->active_io, 0);
468 spin_lock_init(&mddev->write_lock); 506 spin_lock_init(&mddev->lock);
469 atomic_set(&mddev->flush_pending, 0); 507 atomic_set(&mddev->flush_pending, 0);
470 init_waitqueue_head(&mddev->sb_wait); 508 init_waitqueue_head(&mddev->sb_wait);
471 init_waitqueue_head(&mddev->recovery_wait); 509 init_waitqueue_head(&mddev->recovery_wait);
@@ -552,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit)
552 goto retry; 590 goto retry;
553} 591}
554 592
555static inline int __must_check mddev_lock(struct mddev *mddev)
556{
557 return mutex_lock_interruptible(&mddev->reconfig_mutex);
558}
559
560/* Sometimes we need to take the lock in a situation where
561 * failure due to interrupts is not acceptable.
562 */
563static inline void mddev_lock_nointr(struct mddev *mddev)
564{
565 mutex_lock(&mddev->reconfig_mutex);
566}
567
568static inline int mddev_is_locked(struct mddev *mddev)
569{
570 return mutex_is_locked(&mddev->reconfig_mutex);
571}
572
573static inline int mddev_trylock(struct mddev *mddev)
574{
575 return mutex_trylock(&mddev->reconfig_mutex);
576}
577
578static struct attribute_group md_redundancy_group; 593static struct attribute_group md_redundancy_group;
579 594
580static void mddev_unlock(struct mddev *mddev) 595void mddev_unlock(struct mddev *mddev)
581{ 596{
582 if (mddev->to_remove) { 597 if (mddev->to_remove) {
583 /* These cannot be removed under reconfig_mutex as 598 /* These cannot be removed under reconfig_mutex as
@@ -619,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev)
619 md_wakeup_thread(mddev->thread); 634 md_wakeup_thread(mddev->thread);
620 spin_unlock(&pers_lock); 635 spin_unlock(&pers_lock);
621} 636}
637EXPORT_SYMBOL_GPL(mddev_unlock);
622 638
623static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) 639static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
624{ 640{
@@ -2230,7 +2246,7 @@ repeat:
2230 return; 2246 return;
2231 } 2247 }
2232 2248
2233 spin_lock_irq(&mddev->write_lock); 2249 spin_lock(&mddev->lock);
2234 2250
2235 mddev->utime = get_seconds(); 2251 mddev->utime = get_seconds();
2236 2252
@@ -2287,7 +2303,7 @@ repeat:
2287 } 2303 }
2288 2304
2289 sync_sbs(mddev, nospares); 2305 sync_sbs(mddev, nospares);
2290 spin_unlock_irq(&mddev->write_lock); 2306 spin_unlock(&mddev->lock);
2291 2307
2292 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2308 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2293 mdname(mddev), mddev->in_sync); 2309 mdname(mddev), mddev->in_sync);
@@ -2326,15 +2342,15 @@ repeat:
2326 md_super_wait(mddev); 2342 md_super_wait(mddev);
2327 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2343 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2328 2344
2329 spin_lock_irq(&mddev->write_lock); 2345 spin_lock(&mddev->lock);
2330 if (mddev->in_sync != sync_req || 2346 if (mddev->in_sync != sync_req ||
2331 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2347 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2332 /* have to write it out again */ 2348 /* have to write it out again */
2333 spin_unlock_irq(&mddev->write_lock); 2349 spin_unlock(&mddev->lock);
2334 goto repeat; 2350 goto repeat;
2335 } 2351 }
2336 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2352 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2337 spin_unlock_irq(&mddev->write_lock); 2353 spin_unlock(&mddev->lock);
2338 wake_up(&mddev->sb_wait); 2354 wake_up(&mddev->sb_wait);
2339 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2355 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2340 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2356 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -2381,40 +2397,41 @@ state_show(struct md_rdev *rdev, char *page)
2381{ 2397{
2382 char *sep = ""; 2398 char *sep = "";
2383 size_t len = 0; 2399 size_t len = 0;
2400 unsigned long flags = ACCESS_ONCE(rdev->flags);
2384 2401
2385 if (test_bit(Faulty, &rdev->flags) || 2402 if (test_bit(Faulty, &flags) ||
2386 rdev->badblocks.unacked_exist) { 2403 rdev->badblocks.unacked_exist) {
2387 len+= sprintf(page+len, "%sfaulty",sep); 2404 len+= sprintf(page+len, "%sfaulty",sep);
2388 sep = ","; 2405 sep = ",";
2389 } 2406 }
2390 if (test_bit(In_sync, &rdev->flags)) { 2407 if (test_bit(In_sync, &flags)) {
2391 len += sprintf(page+len, "%sin_sync",sep); 2408 len += sprintf(page+len, "%sin_sync",sep);
2392 sep = ","; 2409 sep = ",";
2393 } 2410 }
2394 if (test_bit(WriteMostly, &rdev->flags)) { 2411 if (test_bit(WriteMostly, &flags)) {
2395 len += sprintf(page+len, "%swrite_mostly",sep); 2412 len += sprintf(page+len, "%swrite_mostly",sep);
2396 sep = ","; 2413 sep = ",";
2397 } 2414 }
2398 if (test_bit(Blocked, &rdev->flags) || 2415 if (test_bit(Blocked, &flags) ||
2399 (rdev->badblocks.unacked_exist 2416 (rdev->badblocks.unacked_exist
2400 && !test_bit(Faulty, &rdev->flags))) { 2417 && !test_bit(Faulty, &flags))) {
2401 len += sprintf(page+len, "%sblocked", sep); 2418 len += sprintf(page+len, "%sblocked", sep);
2402 sep = ","; 2419 sep = ",";
2403 } 2420 }
2404 if (!test_bit(Faulty, &rdev->flags) && 2421 if (!test_bit(Faulty, &flags) &&
2405 !test_bit(In_sync, &rdev->flags)) { 2422 !test_bit(In_sync, &flags)) {
2406 len += sprintf(page+len, "%sspare", sep); 2423 len += sprintf(page+len, "%sspare", sep);
2407 sep = ","; 2424 sep = ",";
2408 } 2425 }
2409 if (test_bit(WriteErrorSeen, &rdev->flags)) { 2426 if (test_bit(WriteErrorSeen, &flags)) {
2410 len += sprintf(page+len, "%swrite_error", sep); 2427 len += sprintf(page+len, "%swrite_error", sep);
2411 sep = ","; 2428 sep = ",";
2412 } 2429 }
2413 if (test_bit(WantReplacement, &rdev->flags)) { 2430 if (test_bit(WantReplacement, &flags)) {
2414 len += sprintf(page+len, "%swant_replacement", sep); 2431 len += sprintf(page+len, "%swant_replacement", sep);
2415 sep = ","; 2432 sep = ",";
2416 } 2433 }
2417 if (test_bit(Replacement, &rdev->flags)) { 2434 if (test_bit(Replacement, &flags)) {
2418 len += sprintf(page+len, "%sreplacement", sep); 2435 len += sprintf(page+len, "%sreplacement", sep);
2419 sep = ","; 2436 sep = ",";
2420 } 2437 }
@@ -2927,21 +2944,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2927{ 2944{
2928 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2945 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2929 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 2946 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
2930 struct mddev *mddev = rdev->mddev;
2931 ssize_t rv;
2932 2947
2933 if (!entry->show) 2948 if (!entry->show)
2934 return -EIO; 2949 return -EIO;
2935 2950 if (!rdev->mddev)
2936 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2951 return -EBUSY;
2937 if (!rv) { 2952 return entry->show(rdev, page);
2938 if (rdev->mddev == NULL)
2939 rv = -EBUSY;
2940 else
2941 rv = entry->show(rdev, page);
2942 mddev_unlock(mddev);
2943 }
2944 return rv;
2945} 2953}
2946 2954
2947static ssize_t 2955static ssize_t
@@ -3212,11 +3220,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3212 mddev->safemode_delay = 0; 3220 mddev->safemode_delay = 0;
3213 else { 3221 else {
3214 unsigned long old_delay = mddev->safemode_delay; 3222 unsigned long old_delay = mddev->safemode_delay;
3215 mddev->safemode_delay = (msec*HZ)/1000; 3223 unsigned long new_delay = (msec*HZ)/1000;
3216 if (mddev->safemode_delay == 0) 3224
3217 mddev->safemode_delay = 1; 3225 if (new_delay == 0)
3218 if (mddev->safemode_delay < old_delay || old_delay == 0) 3226 new_delay = 1;
3219 md_safemode_timeout((unsigned long)mddev); 3227 mddev->safemode_delay = new_delay;
3228 if (new_delay < old_delay || old_delay == 0)
3229 mod_timer(&mddev->safemode_timer, jiffies+1);
3220 } 3230 }
3221 return len; 3231 return len;
3222} 3232}
@@ -3226,41 +3236,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3226static ssize_t 3236static ssize_t
3227level_show(struct mddev *mddev, char *page) 3237level_show(struct mddev *mddev, char *page)
3228{ 3238{
3229 struct md_personality *p = mddev->pers; 3239 struct md_personality *p;
3240 int ret;
3241 spin_lock(&mddev->lock);
3242 p = mddev->pers;
3230 if (p) 3243 if (p)
3231 return sprintf(page, "%s\n", p->name); 3244 ret = sprintf(page, "%s\n", p->name);
3232 else if (mddev->clevel[0]) 3245 else if (mddev->clevel[0])
3233 return sprintf(page, "%s\n", mddev->clevel); 3246 ret = sprintf(page, "%s\n", mddev->clevel);
3234 else if (mddev->level != LEVEL_NONE) 3247 else if (mddev->level != LEVEL_NONE)
3235 return sprintf(page, "%d\n", mddev->level); 3248 ret = sprintf(page, "%d\n", mddev->level);
3236 else 3249 else
3237 return 0; 3250 ret = 0;
3251 spin_unlock(&mddev->lock);
3252 return ret;
3238} 3253}
3239 3254
3240static ssize_t 3255static ssize_t
3241level_store(struct mddev *mddev, const char *buf, size_t len) 3256level_store(struct mddev *mddev, const char *buf, size_t len)
3242{ 3257{
3243 char clevel[16]; 3258 char clevel[16];
3244 ssize_t rv = len; 3259 ssize_t rv;
3245 struct md_personality *pers; 3260 size_t slen = len;
3261 struct md_personality *pers, *oldpers;
3246 long level; 3262 long level;
3247 void *priv; 3263 void *priv, *oldpriv;
3248 struct md_rdev *rdev; 3264 struct md_rdev *rdev;
3249 3265
3266 if (slen == 0 || slen >= sizeof(clevel))
3267 return -EINVAL;
3268
3269 rv = mddev_lock(mddev);
3270 if (rv)
3271 return rv;
3272
3250 if (mddev->pers == NULL) { 3273 if (mddev->pers == NULL) {
3251 if (len == 0) 3274 strncpy(mddev->clevel, buf, slen);
3252 return 0; 3275 if (mddev->clevel[slen-1] == '\n')
3253 if (len >= sizeof(mddev->clevel)) 3276 slen--;
3254 return -ENOSPC; 3277 mddev->clevel[slen] = 0;
3255 strncpy(mddev->clevel, buf, len);
3256 if (mddev->clevel[len-1] == '\n')
3257 len--;
3258 mddev->clevel[len] = 0;
3259 mddev->level = LEVEL_NONE; 3278 mddev->level = LEVEL_NONE;
3260 return rv; 3279 rv = len;
3280 goto out_unlock;
3261 } 3281 }
3282 rv = -EROFS;
3262 if (mddev->ro) 3283 if (mddev->ro)
3263 return -EROFS; 3284 goto out_unlock;
3264 3285
3265 /* request to change the personality. Need to ensure: 3286 /* request to change the personality. Need to ensure:
3266 * - array is not engaged in resync/recovery/reshape 3287 * - array is not engaged in resync/recovery/reshape
@@ -3268,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3268 * - new personality will access other array. 3289 * - new personality will access other array.
3269 */ 3290 */
3270 3291
3292 rv = -EBUSY;
3271 if (mddev->sync_thread || 3293 if (mddev->sync_thread ||
3272 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3294 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3273 mddev->reshape_position != MaxSector || 3295 mddev->reshape_position != MaxSector ||
3274 mddev->sysfs_active) 3296 mddev->sysfs_active)
3275 return -EBUSY; 3297 goto out_unlock;
3276 3298
3299 rv = -EINVAL;
3277 if (!mddev->pers->quiesce) { 3300 if (!mddev->pers->quiesce) {
3278 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3301 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3279 mdname(mddev), mddev->pers->name); 3302 mdname(mddev), mddev->pers->name);
3280 return -EINVAL; 3303 goto out_unlock;
3281 } 3304 }
3282 3305
3283 /* Now find the new personality */ 3306 /* Now find the new personality */
3284 if (len == 0 || len >= sizeof(clevel)) 3307 strncpy(clevel, buf, slen);
3285 return -EINVAL; 3308 if (clevel[slen-1] == '\n')
3286 strncpy(clevel, buf, len); 3309 slen--;
3287 if (clevel[len-1] == '\n') 3310 clevel[slen] = 0;
3288 len--;
3289 clevel[len] = 0;
3290 if (kstrtol(clevel, 10, &level)) 3311 if (kstrtol(clevel, 10, &level))
3291 level = LEVEL_NONE; 3312 level = LEVEL_NONE;
3292 3313
@@ -3297,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3297 if (!pers || !try_module_get(pers->owner)) { 3318 if (!pers || !try_module_get(pers->owner)) {
3298 spin_unlock(&pers_lock); 3319 spin_unlock(&pers_lock);
3299 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3320 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
3300 return -EINVAL; 3321 rv = -EINVAL;
3322 goto out_unlock;
3301 } 3323 }
3302 spin_unlock(&pers_lock); 3324 spin_unlock(&pers_lock);
3303 3325
3304 if (pers == mddev->pers) { 3326 if (pers == mddev->pers) {
3305 /* Nothing to do! */ 3327 /* Nothing to do! */
3306 module_put(pers->owner); 3328 module_put(pers->owner);
3307 return rv; 3329 rv = len;
3330 goto out_unlock;
3308 } 3331 }
3309 if (!pers->takeover) { 3332 if (!pers->takeover) {
3310 module_put(pers->owner); 3333 module_put(pers->owner);
3311 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3334 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
3312 mdname(mddev), clevel); 3335 mdname(mddev), clevel);
3313 return -EINVAL; 3336 rv = -EINVAL;
3337 goto out_unlock;
3314 } 3338 }
3315 3339
3316 rdev_for_each(rdev, mddev) 3340 rdev_for_each(rdev, mddev)
@@ -3330,30 +3354,29 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3330 module_put(pers->owner); 3354 module_put(pers->owner);
3331 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3355 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3332 mdname(mddev), clevel); 3356 mdname(mddev), clevel);
3333 return PTR_ERR(priv); 3357 rv = PTR_ERR(priv);
3358 goto out_unlock;
3334 } 3359 }
3335 3360
3336 /* Looks like we have a winner */ 3361 /* Looks like we have a winner */
3337 mddev_suspend(mddev); 3362 mddev_suspend(mddev);
3338 mddev->pers->stop(mddev); 3363 mddev_detach(mddev);
3339 3364
3340 if (mddev->pers->sync_request == NULL && 3365 spin_lock(&mddev->lock);
3341 pers->sync_request != NULL) { 3366 oldpers = mddev->pers;
3342 /* need to add the md_redundancy_group */ 3367 oldpriv = mddev->private;
3343 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3368 mddev->pers = pers;
3344 printk(KERN_WARNING 3369 mddev->private = priv;
3345 "md: cannot register extra attributes for %s\n", 3370 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3346 mdname(mddev)); 3371 mddev->level = mddev->new_level;
3347 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3372 mddev->layout = mddev->new_layout;
3348 } 3373 mddev->chunk_sectors = mddev->new_chunk_sectors;
3349 if (mddev->pers->sync_request != NULL && 3374 mddev->delta_disks = 0;
3350 pers->sync_request == NULL) { 3375 mddev->reshape_backwards = 0;
3351 /* need to remove the md_redundancy_group */ 3376 mddev->degraded = 0;
3352 if (mddev->to_remove == NULL) 3377 spin_unlock(&mddev->lock);
3353 mddev->to_remove = &md_redundancy_group;
3354 }
3355 3378
3356 if (mddev->pers->sync_request == NULL && 3379 if (oldpers->sync_request == NULL &&
3357 mddev->external) { 3380 mddev->external) {
3358 /* We are converting from a no-redundancy array 3381 /* We are converting from a no-redundancy array
3359 * to a redundancy array and metadata is managed 3382 * to a redundancy array and metadata is managed
@@ -3367,6 +3390,24 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3367 mddev->safemode = 0; 3390 mddev->safemode = 0;
3368 } 3391 }
3369 3392
3393 oldpers->free(mddev, oldpriv);
3394
3395 if (oldpers->sync_request == NULL &&
3396 pers->sync_request != NULL) {
3397 /* need to add the md_redundancy_group */
3398 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3399 printk(KERN_WARNING
3400 "md: cannot register extra attributes for %s\n",
3401 mdname(mddev));
3402 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3403 }
3404 if (oldpers->sync_request != NULL &&
3405 pers->sync_request == NULL) {
3406 /* need to remove the md_redundancy_group */
3407 if (mddev->to_remove == NULL)
3408 mddev->to_remove = &md_redundancy_group;
3409 }
3410
3370 rdev_for_each(rdev, mddev) { 3411 rdev_for_each(rdev, mddev) {
3371 if (rdev->raid_disk < 0) 3412 if (rdev->raid_disk < 0)
3372 continue; 3413 continue;
@@ -3392,17 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3392 } 3433 }
3393 } 3434 }
3394 3435
3395 module_put(mddev->pers->owner); 3436 if (pers->sync_request == NULL) {
3396 mddev->pers = pers;
3397 mddev->private = priv;
3398 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3399 mddev->level = mddev->new_level;
3400 mddev->layout = mddev->new_layout;
3401 mddev->chunk_sectors = mddev->new_chunk_sectors;
3402 mddev->delta_disks = 0;
3403 mddev->reshape_backwards = 0;
3404 mddev->degraded = 0;
3405 if (mddev->pers->sync_request == NULL) {
3406 /* this is now an array without redundancy, so 3437 /* this is now an array without redundancy, so
3407 * it must always be in_sync 3438 * it must always be in_sync
3408 */ 3439 */
@@ -3417,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3417 md_update_sb(mddev, 1); 3448 md_update_sb(mddev, 1);
3418 sysfs_notify(&mddev->kobj, NULL, "level"); 3449 sysfs_notify(&mddev->kobj, NULL, "level");
3419 md_new_event(mddev); 3450 md_new_event(mddev);
3451 rv = len;
3452out_unlock:
3453 mddev_unlock(mddev);
3420 return rv; 3454 return rv;
3421} 3455}
3422 3456
@@ -3439,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
3439{ 3473{
3440 char *e; 3474 char *e;
3441 unsigned long n = simple_strtoul(buf, &e, 10); 3475 unsigned long n = simple_strtoul(buf, &e, 10);
3476 int err;
3442 3477
3443 if (!*buf || (*e && *e != '\n')) 3478 if (!*buf || (*e && *e != '\n'))
3444 return -EINVAL; 3479 return -EINVAL;
3480 err = mddev_lock(mddev);
3481 if (err)
3482 return err;
3445 3483
3446 if (mddev->pers) { 3484 if (mddev->pers) {
3447 int err;
3448 if (mddev->pers->check_reshape == NULL) 3485 if (mddev->pers->check_reshape == NULL)
3449 return -EBUSY; 3486 err = -EBUSY;
3450 if (mddev->ro) 3487 else if (mddev->ro)
3451 return -EROFS; 3488 err = -EROFS;
3452 mddev->new_layout = n; 3489 else {
3453 err = mddev->pers->check_reshape(mddev); 3490 mddev->new_layout = n;
3454 if (err) { 3491 err = mddev->pers->check_reshape(mddev);
3455 mddev->new_layout = mddev->layout; 3492 if (err)
3456 return err; 3493 mddev->new_layout = mddev->layout;
3457 } 3494 }
3458 } else { 3495 } else {
3459 mddev->new_layout = n; 3496 mddev->new_layout = n;
3460 if (mddev->reshape_position == MaxSector) 3497 if (mddev->reshape_position == MaxSector)
3461 mddev->layout = n; 3498 mddev->layout = n;
3462 } 3499 }
3463 return len; 3500 mddev_unlock(mddev);
3501 return err ?: len;
3464} 3502}
3465static struct md_sysfs_entry md_layout = 3503static struct md_sysfs_entry md_layout =
3466__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3504__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
@@ -3483,32 +3521,39 @@ static ssize_t
3483raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3521raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3484{ 3522{
3485 char *e; 3523 char *e;
3486 int rv = 0; 3524 int err;
3487 unsigned long n = simple_strtoul(buf, &e, 10); 3525 unsigned long n = simple_strtoul(buf, &e, 10);
3488 3526
3489 if (!*buf || (*e && *e != '\n')) 3527 if (!*buf || (*e && *e != '\n'))
3490 return -EINVAL; 3528 return -EINVAL;
3491 3529
3530 err = mddev_lock(mddev);
3531 if (err)
3532 return err;
3492 if (mddev->pers) 3533 if (mddev->pers)
3493 rv = update_raid_disks(mddev, n); 3534 err = update_raid_disks(mddev, n);
3494 else if (mddev->reshape_position != MaxSector) { 3535 else if (mddev->reshape_position != MaxSector) {
3495 struct md_rdev *rdev; 3536 struct md_rdev *rdev;
3496 int olddisks = mddev->raid_disks - mddev->delta_disks; 3537 int olddisks = mddev->raid_disks - mddev->delta_disks;
3497 3538
3539 err = -EINVAL;
3498 rdev_for_each(rdev, mddev) { 3540 rdev_for_each(rdev, mddev) {
3499 if (olddisks < n && 3541 if (olddisks < n &&
3500 rdev->data_offset < rdev->new_data_offset) 3542 rdev->data_offset < rdev->new_data_offset)
3501 return -EINVAL; 3543 goto out_unlock;
3502 if (olddisks > n && 3544 if (olddisks > n &&
3503 rdev->data_offset > rdev->new_data_offset) 3545 rdev->data_offset > rdev->new_data_offset)
3504 return -EINVAL; 3546 goto out_unlock;
3505 } 3547 }
3548 err = 0;
3506 mddev->delta_disks = n - olddisks; 3549 mddev->delta_disks = n - olddisks;
3507 mddev->raid_disks = n; 3550 mddev->raid_disks = n;
3508 mddev->reshape_backwards = (mddev->delta_disks < 0); 3551 mddev->reshape_backwards = (mddev->delta_disks < 0);
3509 } else 3552 } else
3510 mddev->raid_disks = n; 3553 mddev->raid_disks = n;
3511 return rv ? rv : len; 3554out_unlock:
3555 mddev_unlock(mddev);
3556 return err ? err : len;
3512} 3557}
3513static struct md_sysfs_entry md_raid_disks = 3558static struct md_sysfs_entry md_raid_disks =
3514__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3559__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
@@ -3527,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page)
3527static ssize_t 3572static ssize_t
3528chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3573chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3529{ 3574{
3575 int err;
3530 char *e; 3576 char *e;
3531 unsigned long n = simple_strtoul(buf, &e, 10); 3577 unsigned long n = simple_strtoul(buf, &e, 10);
3532 3578
3533 if (!*buf || (*e && *e != '\n')) 3579 if (!*buf || (*e && *e != '\n'))
3534 return -EINVAL; 3580 return -EINVAL;
3535 3581
3582 err = mddev_lock(mddev);
3583 if (err)
3584 return err;
3536 if (mddev->pers) { 3585 if (mddev->pers) {
3537 int err;
3538 if (mddev->pers->check_reshape == NULL) 3586 if (mddev->pers->check_reshape == NULL)
3539 return -EBUSY; 3587 err = -EBUSY;
3540 if (mddev->ro) 3588 else if (mddev->ro)
3541 return -EROFS; 3589 err = -EROFS;
3542 mddev->new_chunk_sectors = n >> 9; 3590 else {
3543 err = mddev->pers->check_reshape(mddev); 3591 mddev->new_chunk_sectors = n >> 9;
3544 if (err) { 3592 err = mddev->pers->check_reshape(mddev);
3545 mddev->new_chunk_sectors = mddev->chunk_sectors; 3593 if (err)
3546 return err; 3594 mddev->new_chunk_sectors = mddev->chunk_sectors;
3547 } 3595 }
3548 } else { 3596 } else {
3549 mddev->new_chunk_sectors = n >> 9; 3597 mddev->new_chunk_sectors = n >> 9;
3550 if (mddev->reshape_position == MaxSector) 3598 if (mddev->reshape_position == MaxSector)
3551 mddev->chunk_sectors = n >> 9; 3599 mddev->chunk_sectors = n >> 9;
3552 } 3600 }
3553 return len; 3601 mddev_unlock(mddev);
3602 return err ?: len;
3554} 3603}
3555static struct md_sysfs_entry md_chunk_size = 3604static struct md_sysfs_entry md_chunk_size =
3556__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3605__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
@@ -3566,20 +3615,27 @@ resync_start_show(struct mddev *mddev, char *page)
3566static ssize_t 3615static ssize_t
3567resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3616resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3568{ 3617{
3618 int err;
3569 char *e; 3619 char *e;
3570 unsigned long long n = simple_strtoull(buf, &e, 10); 3620 unsigned long long n = simple_strtoull(buf, &e, 10);
3571 3621
3622 err = mddev_lock(mddev);
3623 if (err)
3624 return err;
3572 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3625 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3573 return -EBUSY; 3626 err = -EBUSY;
3574 if (cmd_match(buf, "none")) 3627 else if (cmd_match(buf, "none"))
3575 n = MaxSector; 3628 n = MaxSector;
3576 else if (!*buf || (*e && *e != '\n')) 3629 else if (!*buf || (*e && *e != '\n'))
3577 return -EINVAL; 3630 err = -EINVAL;
3578 3631
3579 mddev->recovery_cp = n; 3632 if (!err) {
3580 if (mddev->pers) 3633 mddev->recovery_cp = n;
3581 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3634 if (mddev->pers)
3582 return len; 3635 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3636 }
3637 mddev_unlock(mddev);
3638 return err ?: len;
3583} 3639}
3584static struct md_sysfs_entry md_resync_start = 3640static struct md_sysfs_entry md_resync_start =
3585__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 3641__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
@@ -3677,8 +3733,39 @@ static int restart_array(struct mddev *mddev);
3677static ssize_t 3733static ssize_t
3678array_state_store(struct mddev *mddev, const char *buf, size_t len) 3734array_state_store(struct mddev *mddev, const char *buf, size_t len)
3679{ 3735{
3680 int err = -EINVAL; 3736 int err;
3681 enum array_state st = match_word(buf, array_states); 3737 enum array_state st = match_word(buf, array_states);
3738
3739 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
3740 /* don't take reconfig_mutex when toggling between
3741 * clean and active
3742 */
3743 spin_lock(&mddev->lock);
3744 if (st == active) {
3745 restart_array(mddev);
3746 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3747 wake_up(&mddev->sb_wait);
3748 err = 0;
3749 } else /* st == clean */ {
3750 restart_array(mddev);
3751 if (atomic_read(&mddev->writes_pending) == 0) {
3752 if (mddev->in_sync == 0) {
3753 mddev->in_sync = 1;
3754 if (mddev->safemode == 1)
3755 mddev->safemode = 0;
3756 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3757 }
3758 err = 0;
3759 } else
3760 err = -EBUSY;
3761 }
3762 spin_unlock(&mddev->lock);
3763 return err;
3764 }
3765 err = mddev_lock(mddev);
3766 if (err)
3767 return err;
3768 err = -EINVAL;
3682 switch(st) { 3769 switch(st) {
3683 case bad_word: 3770 case bad_word:
3684 break; 3771 break;
@@ -3722,7 +3809,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3722 case clean: 3809 case clean:
3723 if (mddev->pers) { 3810 if (mddev->pers) {
3724 restart_array(mddev); 3811 restart_array(mddev);
3725 spin_lock_irq(&mddev->write_lock); 3812 spin_lock(&mddev->lock);
3726 if (atomic_read(&mddev->writes_pending) == 0) { 3813 if (atomic_read(&mddev->writes_pending) == 0) {
3727 if (mddev->in_sync == 0) { 3814 if (mddev->in_sync == 0) {
3728 mddev->in_sync = 1; 3815 mddev->in_sync = 1;
@@ -3733,7 +3820,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3733 err = 0; 3820 err = 0;
3734 } else 3821 } else
3735 err = -EBUSY; 3822 err = -EBUSY;
3736 spin_unlock_irq(&mddev->write_lock); 3823 spin_unlock(&mddev->lock);
3737 } else 3824 } else
3738 err = -EINVAL; 3825 err = -EINVAL;
3739 break; 3826 break;
@@ -3754,14 +3841,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3754 /* these cannot be set */ 3841 /* these cannot be set */
3755 break; 3842 break;
3756 } 3843 }
3757 if (err) 3844
3758 return err; 3845 if (!err) {
3759 else {
3760 if (mddev->hold_active == UNTIL_IOCTL) 3846 if (mddev->hold_active == UNTIL_IOCTL)
3761 mddev->hold_active = 0; 3847 mddev->hold_active = 0;
3762 sysfs_notify_dirent_safe(mddev->sysfs_state); 3848 sysfs_notify_dirent_safe(mddev->sysfs_state);
3763 return len;
3764 } 3849 }
3850 mddev_unlock(mddev);
3851 return err ?: len;
3765} 3852}
3766static struct md_sysfs_entry md_array_state = 3853static struct md_sysfs_entry md_array_state =
3767__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3854__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -3822,6 +3909,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
3822 minor != MINOR(dev)) 3909 minor != MINOR(dev))
3823 return -EOVERFLOW; 3910 return -EOVERFLOW;
3824 3911
3912 flush_workqueue(md_misc_wq);
3913
3914 err = mddev_lock(mddev);
3915 if (err)
3916 return err;
3825 if (mddev->persistent) { 3917 if (mddev->persistent) {
3826 rdev = md_import_device(dev, mddev->major_version, 3918 rdev = md_import_device(dev, mddev->major_version,
3827 mddev->minor_version); 3919 mddev->minor_version);
@@ -3845,6 +3937,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
3845 out: 3937 out:
3846 if (err) 3938 if (err)
3847 export_rdev(rdev); 3939 export_rdev(rdev);
3940 mddev_unlock(mddev);
3848 return err ? err : len; 3941 return err ? err : len;
3849} 3942}
3850 3943
@@ -3856,7 +3949,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
3856{ 3949{
3857 char *end; 3950 char *end;
3858 unsigned long chunk, end_chunk; 3951 unsigned long chunk, end_chunk;
3952 int err;
3859 3953
3954 err = mddev_lock(mddev);
3955 if (err)
3956 return err;
3860 if (!mddev->bitmap) 3957 if (!mddev->bitmap)
3861 goto out; 3958 goto out;
3862 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 3959 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
@@ -3874,6 +3971,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
3874 } 3971 }
3875 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 3972 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3876out: 3973out:
3974 mddev_unlock(mddev);
3877 return len; 3975 return len;
3878} 3976}
3879 3977
@@ -3901,6 +3999,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
3901 3999
3902 if (err < 0) 4000 if (err < 0)
3903 return err; 4001 return err;
4002 err = mddev_lock(mddev);
4003 if (err)
4004 return err;
3904 if (mddev->pers) { 4005 if (mddev->pers) {
3905 err = update_size(mddev, sectors); 4006 err = update_size(mddev, sectors);
3906 md_update_sb(mddev, 1); 4007 md_update_sb(mddev, 1);
@@ -3911,6 +4012,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
3911 else 4012 else
3912 err = -ENOSPC; 4013 err = -ENOSPC;
3913 } 4014 }
4015 mddev_unlock(mddev);
3914 return err ? err : len; 4016 return err ? err : len;
3915} 4017}
3916 4018
@@ -3940,21 +4042,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
3940{ 4042{
3941 int major, minor; 4043 int major, minor;
3942 char *e; 4044 char *e;
4045 int err;
3943 /* Changing the details of 'external' metadata is 4046 /* Changing the details of 'external' metadata is
3944 * always permitted. Otherwise there must be 4047 * always permitted. Otherwise there must be
3945 * no devices attached to the array. 4048 * no devices attached to the array.
3946 */ 4049 */
4050
4051 err = mddev_lock(mddev);
4052 if (err)
4053 return err;
4054 err = -EBUSY;
3947 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4055 if (mddev->external && strncmp(buf, "external:", 9) == 0)
3948 ; 4056 ;
3949 else if (!list_empty(&mddev->disks)) 4057 else if (!list_empty(&mddev->disks))
3950 return -EBUSY; 4058 goto out_unlock;
3951 4059
4060 err = 0;
3952 if (cmd_match(buf, "none")) { 4061 if (cmd_match(buf, "none")) {
3953 mddev->persistent = 0; 4062 mddev->persistent = 0;
3954 mddev->external = 0; 4063 mddev->external = 0;
3955 mddev->major_version = 0; 4064 mddev->major_version = 0;
3956 mddev->minor_version = 90; 4065 mddev->minor_version = 90;
3957 return len; 4066 goto out_unlock;
3958 } 4067 }
3959 if (strncmp(buf, "external:", 9) == 0) { 4068 if (strncmp(buf, "external:", 9) == 0) {
3960 size_t namelen = len-9; 4069 size_t namelen = len-9;
@@ -3968,22 +4077,27 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
3968 mddev->external = 1; 4077 mddev->external = 1;
3969 mddev->major_version = 0; 4078 mddev->major_version = 0;
3970 mddev->minor_version = 90; 4079 mddev->minor_version = 90;
3971 return len; 4080 goto out_unlock;
3972 } 4081 }
3973 major = simple_strtoul(buf, &e, 10); 4082 major = simple_strtoul(buf, &e, 10);
4083 err = -EINVAL;
3974 if (e==buf || *e != '.') 4084 if (e==buf || *e != '.')
3975 return -EINVAL; 4085 goto out_unlock;
3976 buf = e+1; 4086 buf = e+1;
3977 minor = simple_strtoul(buf, &e, 10); 4087 minor = simple_strtoul(buf, &e, 10);
3978 if (e==buf || (*e && *e != '\n') ) 4088 if (e==buf || (*e && *e != '\n') )
3979 return -EINVAL; 4089 goto out_unlock;
4090 err = -ENOENT;
3980 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4091 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3981 return -ENOENT; 4092 goto out_unlock;
3982 mddev->major_version = major; 4093 mddev->major_version = major;
3983 mddev->minor_version = minor; 4094 mddev->minor_version = minor;
3984 mddev->persistent = 1; 4095 mddev->persistent = 1;
3985 mddev->external = 0; 4096 mddev->external = 0;
3986 return len; 4097 err = 0;
4098out_unlock:
4099 mddev_unlock(mddev);
4100 return err ?: len;
3987} 4101}
3988 4102
3989static struct md_sysfs_entry md_metadata = 4103static struct md_sysfs_entry md_metadata =
@@ -3993,20 +4107,21 @@ static ssize_t
3993action_show(struct mddev *mddev, char *page) 4107action_show(struct mddev *mddev, char *page)
3994{ 4108{
3995 char *type = "idle"; 4109 char *type = "idle";
3996 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4110 unsigned long recovery = mddev->recovery;
4111 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
3997 type = "frozen"; 4112 type = "frozen";
3998 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4113 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
3999 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 4114 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4000 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4115 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4001 type = "reshape"; 4116 type = "reshape";
4002 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4117 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4003 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 4118 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4004 type = "resync"; 4119 type = "resync";
4005 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 4120 else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4006 type = "check"; 4121 type = "check";
4007 else 4122 else
4008 type = "repair"; 4123 type = "repair";
4009 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 4124 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4010 type = "recover"; 4125 type = "recover";
4011 } 4126 }
4012 return sprintf(page, "%s\n", type); 4127 return sprintf(page, "%s\n", type);
@@ -4027,7 +4142,10 @@ action_store(struct mddev *mddev, const char *page, size_t len)
4027 flush_workqueue(md_misc_wq); 4142 flush_workqueue(md_misc_wq);
4028 if (mddev->sync_thread) { 4143 if (mddev->sync_thread) {
4029 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4144 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4030 md_reap_sync_thread(mddev); 4145 if (mddev_lock(mddev) == 0) {
4146 md_reap_sync_thread(mddev);
4147 mddev_unlock(mddev);
4148 }
4031 } 4149 }
4032 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4150 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4033 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 4151 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -4041,7 +4159,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
4041 int err; 4159 int err;
4042 if (mddev->pers->start_reshape == NULL) 4160 if (mddev->pers->start_reshape == NULL)
4043 return -EINVAL; 4161 return -EINVAL;
4044 err = mddev->pers->start_reshape(mddev); 4162 err = mddev_lock(mddev);
4163 if (!err) {
4164 err = mddev->pers->start_reshape(mddev);
4165 mddev_unlock(mddev);
4166 }
4045 if (err) 4167 if (err)
4046 return err; 4168 return err;
4047 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4169 sysfs_notify(&mddev->kobj, NULL, "degraded");
@@ -4225,22 +4347,36 @@ static ssize_t
4225min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4347min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4226{ 4348{
4227 unsigned long long min; 4349 unsigned long long min;
4350 int err;
4351 int chunk;
4352
4228 if (kstrtoull(buf, 10, &min)) 4353 if (kstrtoull(buf, 10, &min))
4229 return -EINVAL; 4354 return -EINVAL;
4355
4356 spin_lock(&mddev->lock);
4357 err = -EINVAL;
4230 if (min > mddev->resync_max) 4358 if (min > mddev->resync_max)
4231 return -EINVAL; 4359 goto out_unlock;
4360
4361 err = -EBUSY;
4232 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4362 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4233 return -EBUSY; 4363 goto out_unlock;
4234 4364
4235 /* Must be a multiple of chunk_size */ 4365 /* Must be a multiple of chunk_size */
4236 if (mddev->chunk_sectors) { 4366 chunk = mddev->chunk_sectors;
4367 if (chunk) {
4237 sector_t temp = min; 4368 sector_t temp = min;
4238 if (sector_div(temp, mddev->chunk_sectors)) 4369
4239 return -EINVAL; 4370 err = -EINVAL;
4371 if (sector_div(temp, chunk))
4372 goto out_unlock;
4240 } 4373 }
4241 mddev->resync_min = min; 4374 mddev->resync_min = min;
4375 err = 0;
4242 4376
4243 return len; 4377out_unlock:
4378 spin_unlock(&mddev->lock);
4379 return err ?: len;
4244} 4380}
4245 4381
4246static struct md_sysfs_entry md_min_sync = 4382static struct md_sysfs_entry md_min_sync =
@@ -4258,29 +4394,42 @@ max_sync_show(struct mddev *mddev, char *page)
4258static ssize_t 4394static ssize_t
4259max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4395max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4260{ 4396{
4397 int err;
4398 spin_lock(&mddev->lock);
4261 if (strncmp(buf, "max", 3) == 0) 4399 if (strncmp(buf, "max", 3) == 0)
4262 mddev->resync_max = MaxSector; 4400 mddev->resync_max = MaxSector;
4263 else { 4401 else {
4264 unsigned long long max; 4402 unsigned long long max;
4403 int chunk;
4404
4405 err = -EINVAL;
4265 if (kstrtoull(buf, 10, &max)) 4406 if (kstrtoull(buf, 10, &max))
4266 return -EINVAL; 4407 goto out_unlock;
4267 if (max < mddev->resync_min) 4408 if (max < mddev->resync_min)
4268 return -EINVAL; 4409 goto out_unlock;
4410
4411 err = -EBUSY;
4269 if (max < mddev->resync_max && 4412 if (max < mddev->resync_max &&
4270 mddev->ro == 0 && 4413 mddev->ro == 0 &&
4271 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4414 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4272 return -EBUSY; 4415 goto out_unlock;
4273 4416
4274 /* Must be a multiple of chunk_size */ 4417 /* Must be a multiple of chunk_size */
4275 if (mddev->chunk_sectors) { 4418 chunk = mddev->chunk_sectors;
4419 if (chunk) {
4276 sector_t temp = max; 4420 sector_t temp = max;
4277 if (sector_div(temp, mddev->chunk_sectors)) 4421
4278 return -EINVAL; 4422 err = -EINVAL;
4423 if (sector_div(temp, chunk))
4424 goto out_unlock;
4279 } 4425 }
4280 mddev->resync_max = max; 4426 mddev->resync_max = max;
4281 } 4427 }
4282 wake_up(&mddev->recovery_wait); 4428 wake_up(&mddev->recovery_wait);
4283 return len; 4429 err = 0;
4430out_unlock:
4431 spin_unlock(&mddev->lock);
4432 return err ?: len;
4284} 4433}
4285 4434
4286static struct md_sysfs_entry md_max_sync = 4435static struct md_sysfs_entry md_max_sync =
@@ -4297,14 +4446,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4297{ 4446{
4298 char *e; 4447 char *e;
4299 unsigned long long new = simple_strtoull(buf, &e, 10); 4448 unsigned long long new = simple_strtoull(buf, &e, 10);
4300 unsigned long long old = mddev->suspend_lo; 4449 unsigned long long old;
4450 int err;
4301 4451
4302 if (mddev->pers == NULL ||
4303 mddev->pers->quiesce == NULL)
4304 return -EINVAL;
4305 if (buf == e || (*e && *e != '\n')) 4452 if (buf == e || (*e && *e != '\n'))
4306 return -EINVAL; 4453 return -EINVAL;
4307 4454
4455 err = mddev_lock(mddev);
4456 if (err)
4457 return err;
4458 err = -EINVAL;
4459 if (mddev->pers == NULL ||
4460 mddev->pers->quiesce == NULL)
4461 goto unlock;
4462 old = mddev->suspend_lo;
4308 mddev->suspend_lo = new; 4463 mddev->suspend_lo = new;
4309 if (new >= old) 4464 if (new >= old)
4310 /* Shrinking suspended region */ 4465 /* Shrinking suspended region */
@@ -4314,7 +4469,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4314 mddev->pers->quiesce(mddev, 1); 4469 mddev->pers->quiesce(mddev, 1);
4315 mddev->pers->quiesce(mddev, 0); 4470 mddev->pers->quiesce(mddev, 0);
4316 } 4471 }
4317 return len; 4472 err = 0;
4473unlock:
4474 mddev_unlock(mddev);
4475 return err ?: len;
4318} 4476}
4319static struct md_sysfs_entry md_suspend_lo = 4477static struct md_sysfs_entry md_suspend_lo =
4320__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4478__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4330,14 +4488,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4330{ 4488{
4331 char *e; 4489 char *e;
4332 unsigned long long new = simple_strtoull(buf, &e, 10); 4490 unsigned long long new = simple_strtoull(buf, &e, 10);
4333 unsigned long long old = mddev->suspend_hi; 4491 unsigned long long old;
4492 int err;
4334 4493
4335 if (mddev->pers == NULL ||
4336 mddev->pers->quiesce == NULL)
4337 return -EINVAL;
4338 if (buf == e || (*e && *e != '\n')) 4494 if (buf == e || (*e && *e != '\n'))
4339 return -EINVAL; 4495 return -EINVAL;
4340 4496
4497 err = mddev_lock(mddev);
4498 if (err)
4499 return err;
4500 err = -EINVAL;
4501 if (mddev->pers == NULL ||
4502 mddev->pers->quiesce == NULL)
4503 goto unlock;
4504 old = mddev->suspend_hi;
4341 mddev->suspend_hi = new; 4505 mddev->suspend_hi = new;
4342 if (new <= old) 4506 if (new <= old)
4343 /* Shrinking suspended region */ 4507 /* Shrinking suspended region */
@@ -4347,7 +4511,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4347 mddev->pers->quiesce(mddev, 1); 4511 mddev->pers->quiesce(mddev, 1);
4348 mddev->pers->quiesce(mddev, 0); 4512 mddev->pers->quiesce(mddev, 0);
4349 } 4513 }
4350 return len; 4514 err = 0;
4515unlock:
4516 mddev_unlock(mddev);
4517 return err ?: len;
4351} 4518}
4352static struct md_sysfs_entry md_suspend_hi = 4519static struct md_sysfs_entry md_suspend_hi =
4353__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4520__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4367,11 +4534,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4367{ 4534{
4368 struct md_rdev *rdev; 4535 struct md_rdev *rdev;
4369 char *e; 4536 char *e;
4537 int err;
4370 unsigned long long new = simple_strtoull(buf, &e, 10); 4538 unsigned long long new = simple_strtoull(buf, &e, 10);
4371 if (mddev->pers) 4539
4372 return -EBUSY;
4373 if (buf == e || (*e && *e != '\n')) 4540 if (buf == e || (*e && *e != '\n'))
4374 return -EINVAL; 4541 return -EINVAL;
4542 err = mddev_lock(mddev);
4543 if (err)
4544 return err;
4545 err = -EBUSY;
4546 if (mddev->pers)
4547 goto unlock;
4375 mddev->reshape_position = new; 4548 mddev->reshape_position = new;
4376 mddev->delta_disks = 0; 4549 mddev->delta_disks = 0;
4377 mddev->reshape_backwards = 0; 4550 mddev->reshape_backwards = 0;
@@ -4380,7 +4553,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4380 mddev->new_chunk_sectors = mddev->chunk_sectors; 4553 mddev->new_chunk_sectors = mddev->chunk_sectors;
4381 rdev_for_each(rdev, mddev) 4554 rdev_for_each(rdev, mddev)
4382 rdev->new_data_offset = rdev->data_offset; 4555 rdev->new_data_offset = rdev->data_offset;
4383 return len; 4556 err = 0;
4557unlock:
4558 mddev_unlock(mddev);
4559 return err ?: len;
4384} 4560}
4385 4561
4386static struct md_sysfs_entry md_reshape_position = 4562static struct md_sysfs_entry md_reshape_position =
@@ -4398,6 +4574,8 @@ static ssize_t
4398reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4574reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4399{ 4575{
4400 int backwards = 0; 4576 int backwards = 0;
4577 int err;
4578
4401 if (cmd_match(buf, "forwards")) 4579 if (cmd_match(buf, "forwards"))
4402 backwards = 0; 4580 backwards = 0;
4403 else if (cmd_match(buf, "backwards")) 4581 else if (cmd_match(buf, "backwards"))
@@ -4407,16 +4585,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4407 if (mddev->reshape_backwards == backwards) 4585 if (mddev->reshape_backwards == backwards)
4408 return len; 4586 return len;
4409 4587
4588 err = mddev_lock(mddev);
4589 if (err)
4590 return err;
4410 /* check if we are allowed to change */ 4591 /* check if we are allowed to change */
4411 if (mddev->delta_disks) 4592 if (mddev->delta_disks)
4412 return -EBUSY; 4593 err = -EBUSY;
4413 4594 else if (mddev->persistent &&
4414 if (mddev->persistent &&
4415 mddev->major_version == 0) 4595 mddev->major_version == 0)
4416 return -EINVAL; 4596 err = -EINVAL;
4417 4597 else
4418 mddev->reshape_backwards = backwards; 4598 mddev->reshape_backwards = backwards;
4419 return len; 4599 mddev_unlock(mddev);
4600 return err ?: len;
4420} 4601}
4421 4602
4422static struct md_sysfs_entry md_reshape_direction = 4603static struct md_sysfs_entry md_reshape_direction =
@@ -4437,6 +4618,11 @@ static ssize_t
4437array_size_store(struct mddev *mddev, const char *buf, size_t len) 4618array_size_store(struct mddev *mddev, const char *buf, size_t len)
4438{ 4619{
4439 sector_t sectors; 4620 sector_t sectors;
4621 int err;
4622
4623 err = mddev_lock(mddev);
4624 if (err)
4625 return err;
4440 4626
4441 if (strncmp(buf, "default", 7) == 0) { 4627 if (strncmp(buf, "default", 7) == 0) {
4442 if (mddev->pers) 4628 if (mddev->pers)
@@ -4447,19 +4633,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
4447 mddev->external_size = 0; 4633 mddev->external_size = 0;
4448 } else { 4634 } else {
4449 if (strict_blocks_to_sectors(buf, &sectors) < 0) 4635 if (strict_blocks_to_sectors(buf, &sectors) < 0)
4450 return -EINVAL; 4636 err = -EINVAL;
4451 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4637 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4452 return -E2BIG; 4638 err = -E2BIG;
4453 4639 else
4454 mddev->external_size = 1; 4640 mddev->external_size = 1;
4455 } 4641 }
4456 4642
4457 mddev->array_sectors = sectors; 4643 if (!err) {
4458 if (mddev->pers) { 4644 mddev->array_sectors = sectors;
4459 set_capacity(mddev->gendisk, mddev->array_sectors); 4645 if (mddev->pers) {
4460 revalidate_disk(mddev->gendisk); 4646 set_capacity(mddev->gendisk, mddev->array_sectors);
4647 revalidate_disk(mddev->gendisk);
4648 }
4461 } 4649 }
4462 return len; 4650 mddev_unlock(mddev);
4651 return err ?: len;
4463} 4652}
4464 4653
4465static struct md_sysfs_entry md_array_size = 4654static struct md_sysfs_entry md_array_size =
@@ -4523,11 +4712,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4523 mddev_get(mddev); 4712 mddev_get(mddev);
4524 spin_unlock(&all_mddevs_lock); 4713 spin_unlock(&all_mddevs_lock);
4525 4714
4526 rv = mddev_lock(mddev); 4715 rv = entry->show(mddev, page);
4527 if (!rv) {
4528 rv = entry->show(mddev, page);
4529 mddev_unlock(mddev);
4530 }
4531 mddev_put(mddev); 4716 mddev_put(mddev);
4532 return rv; 4717 return rv;
4533} 4718}
@@ -4551,13 +4736,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
4551 } 4736 }
4552 mddev_get(mddev); 4737 mddev_get(mddev);
4553 spin_unlock(&all_mddevs_lock); 4738 spin_unlock(&all_mddevs_lock);
4554 if (entry->store == new_dev_store) 4739 rv = entry->store(mddev, page, length);
4555 flush_workqueue(md_misc_wq);
4556 rv = mddev_lock(mddev);
4557 if (!rv) {
4558 rv = entry->store(mddev, page, length);
4559 mddev_unlock(mddev);
4560 }
4561 mddev_put(mddev); 4740 mddev_put(mddev);
4562 return rv; 4741 return rv;
4563} 4742}
@@ -4825,7 +5004,6 @@ int md_run(struct mddev *mddev)
4825 mddev->clevel); 5004 mddev->clevel);
4826 return -EINVAL; 5005 return -EINVAL;
4827 } 5006 }
4828 mddev->pers = pers;
4829 spin_unlock(&pers_lock); 5007 spin_unlock(&pers_lock);
4830 if (mddev->level != pers->level) { 5008 if (mddev->level != pers->level) {
4831 mddev->level = pers->level; 5009 mddev->level = pers->level;
@@ -4836,7 +5014,6 @@ int md_run(struct mddev *mddev)
4836 if (mddev->reshape_position != MaxSector && 5014 if (mddev->reshape_position != MaxSector &&
4837 pers->start_reshape == NULL) { 5015 pers->start_reshape == NULL) {
4838 /* This personality cannot handle reshaping... */ 5016 /* This personality cannot handle reshaping... */
4839 mddev->pers = NULL;
4840 module_put(pers->owner); 5017 module_put(pers->owner);
4841 return -EINVAL; 5018 return -EINVAL;
4842 } 5019 }
@@ -4880,35 +5057,38 @@ int md_run(struct mddev *mddev)
4880 if (start_readonly && mddev->ro == 0) 5057 if (start_readonly && mddev->ro == 0)
4881 mddev->ro = 2; /* read-only, but switch on first write */ 5058 mddev->ro = 2; /* read-only, but switch on first write */
4882 5059
4883 err = mddev->pers->run(mddev); 5060 err = pers->run(mddev);
4884 if (err) 5061 if (err)
4885 printk(KERN_ERR "md: pers->run() failed ...\n"); 5062 printk(KERN_ERR "md: pers->run() failed ...\n");
4886 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 5063 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
4887 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5064 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4888 " but 'external_size' not in effect?\n", __func__); 5065 " but 'external_size' not in effect?\n", __func__);
4889 printk(KERN_ERR 5066 printk(KERN_ERR
4890 "md: invalid array_size %llu > default size %llu\n", 5067 "md: invalid array_size %llu > default size %llu\n",
4891 (unsigned long long)mddev->array_sectors / 2, 5068 (unsigned long long)mddev->array_sectors / 2,
4892 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 5069 (unsigned long long)pers->size(mddev, 0, 0) / 2);
4893 err = -EINVAL; 5070 err = -EINVAL;
4894 mddev->pers->stop(mddev);
4895 } 5071 }
4896 if (err == 0 && mddev->pers->sync_request && 5072 if (err == 0 && pers->sync_request &&
4897 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5073 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
4898 err = bitmap_create(mddev); 5074 err = bitmap_create(mddev);
4899 if (err) { 5075 if (err)
4900 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5076 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4901 mdname(mddev), err); 5077 mdname(mddev), err);
4902 mddev->pers->stop(mddev);
4903 }
4904 } 5078 }
4905 if (err) { 5079 if (err) {
4906 module_put(mddev->pers->owner); 5080 mddev_detach(mddev);
4907 mddev->pers = NULL; 5081 pers->free(mddev, mddev->private);
5082 module_put(pers->owner);
4908 bitmap_destroy(mddev); 5083 bitmap_destroy(mddev);
4909 return err; 5084 return err;
4910 } 5085 }
4911 if (mddev->pers->sync_request) { 5086 if (mddev->queue) {
5087 mddev->queue->backing_dev_info.congested_data = mddev;
5088 mddev->queue->backing_dev_info.congested_fn = md_congested;
5089 blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
5090 }
5091 if (pers->sync_request) {
4912 if (mddev->kobj.sd && 5092 if (mddev->kobj.sd &&
4913 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5093 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4914 printk(KERN_WARNING 5094 printk(KERN_WARNING
@@ -4927,7 +5107,10 @@ int md_run(struct mddev *mddev)
4927 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5107 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4928 mddev->in_sync = 1; 5108 mddev->in_sync = 1;
4929 smp_wmb(); 5109 smp_wmb();
5110 spin_lock(&mddev->lock);
5111 mddev->pers = pers;
4930 mddev->ready = 1; 5112 mddev->ready = 1;
5113 spin_unlock(&mddev->lock);
4931 rdev_for_each(rdev, mddev) 5114 rdev_for_each(rdev, mddev)
4932 if (rdev->raid_disk >= 0) 5115 if (rdev->raid_disk >= 0)
4933 if (sysfs_link_rdev(mddev, rdev)) 5116 if (sysfs_link_rdev(mddev, rdev))
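Note: md_run() now wires the queue's congested_fn and merge_bvec_fn to md-core wrappers (md_congested, md_mergeable_bvec) instead of letting each personality touch backing_dev_info itself. Those wrappers are not part of this hunk, so the following is only an assumed sketch of how such a wrapper could dispatch to the new ->congested hook.

	/* Assumed shape of the core-side congestion wrapper; the real
	 * helper presumably also accounts for a suspended array.
	 */
	static int md_congested_sketch(void *data, int bits)
	{
		struct mddev *mddev = data;

		if (mddev->pers && mddev->pers->congested)
			return mddev->pers->congested(mddev, bits);
		return 0;
	}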
@@ -5070,14 +5253,38 @@ void md_stop_writes(struct mddev *mddev)
5070} 5253}
5071EXPORT_SYMBOL_GPL(md_stop_writes); 5254EXPORT_SYMBOL_GPL(md_stop_writes);
5072 5255
5256static void mddev_detach(struct mddev *mddev)
5257{
5258 struct bitmap *bitmap = mddev->bitmap;
5259 /* wait for behind writes to complete */
5260 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
5261 printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
5262 mdname(mddev));
5263 /* need to kick something here to make sure I/O goes? */
5264 wait_event(bitmap->behind_wait,
5265 atomic_read(&bitmap->behind_writes) == 0);
5266 }
5267 if (mddev->pers && mddev->pers->quiesce) {
5268 mddev->pers->quiesce(mddev, 1);
5269 mddev->pers->quiesce(mddev, 0);
5270 }
5271 md_unregister_thread(&mddev->thread);
5272 if (mddev->queue)
5273 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5274}
5275
5073static void __md_stop(struct mddev *mddev) 5276static void __md_stop(struct mddev *mddev)
5074{ 5277{
5278 struct md_personality *pers = mddev->pers;
5279 mddev_detach(mddev);
5280 spin_lock(&mddev->lock);
5075 mddev->ready = 0; 5281 mddev->ready = 0;
5076 mddev->pers->stop(mddev);
5077 if (mddev->pers->sync_request && mddev->to_remove == NULL)
5078 mddev->to_remove = &md_redundancy_group;
5079 module_put(mddev->pers->owner);
5080 mddev->pers = NULL; 5282 mddev->pers = NULL;
5283 spin_unlock(&mddev->lock);
5284 pers->free(mddev, mddev->private);
5285 if (pers->sync_request && mddev->to_remove == NULL)
5286 mddev->to_remove = &md_redundancy_group;
5287 module_put(pers->owner);
5081 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5288 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5082} 5289}
5083 5290
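Note: with mddev_detach() owning the quiesce/thread/queue teardown, a personality's ->free callback is reduced to releasing its private data. A sketch of the resulting split; example_conf and example_free are hypothetical names used for illustration.

	/* Teardown order established above (sketch):
	 * 1. mddev_detach() - wait for behind writes, quiesce, stop the md
	 *                     thread, sync the queue so nothing uses conf
	 * 2. pers->free()   - personality releases its private state only
	 * 3. module_put()   - drop the reference on the personality module
	 */
	struct example_conf {
		/* personality-private state */
	};

	static void example_free(struct mddev *mddev, void *priv)
	{
		struct example_conf *conf = priv;

		kfree(conf);	/* no thread or queue handling needed here */
	}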
@@ -5226,8 +5433,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
5226 5433
5227 bitmap_destroy(mddev); 5434 bitmap_destroy(mddev);
5228 if (mddev->bitmap_info.file) { 5435 if (mddev->bitmap_info.file) {
5229 fput(mddev->bitmap_info.file); 5436 struct file *f = mddev->bitmap_info.file;
5437 spin_lock(&mddev->lock);
5230 mddev->bitmap_info.file = NULL; 5438 mddev->bitmap_info.file = NULL;
5439 spin_unlock(&mddev->lock);
5440 fput(f);
5231 } 5441 }
5232 mddev->bitmap_info.offset = 0; 5442 mddev->bitmap_info.offset = 0;
5233 5443
@@ -5436,37 +5646,31 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
5436static int get_bitmap_file(struct mddev *mddev, void __user * arg) 5646static int get_bitmap_file(struct mddev *mddev, void __user * arg)
5437{ 5647{
5438 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5648 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5439 char *ptr, *buf = NULL; 5649 char *ptr;
5440 int err = -ENOMEM; 5650 int err;
5441 5651
5442 file = kmalloc(sizeof(*file), GFP_NOIO); 5652 file = kmalloc(sizeof(*file), GFP_NOIO);
5443
5444 if (!file) 5653 if (!file)
5445 goto out; 5654 return -ENOMEM;
5446 5655
5656 err = 0;
5657 spin_lock(&mddev->lock);
5447 /* bitmap disabled, zero the first byte and copy out */ 5658 /* bitmap disabled, zero the first byte and copy out */
5448 if (!mddev->bitmap || !mddev->bitmap->storage.file) { 5659 if (!mddev->bitmap_info.file)
5449 file->pathname[0] = '\0'; 5660 file->pathname[0] = '\0';
5450 goto copy_out; 5661 else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
5451 } 5662 file->pathname, sizeof(file->pathname))),
5452 5663 IS_ERR(ptr))
5453 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 5664 err = PTR_ERR(ptr);
5454 if (!buf) 5665 else
5455 goto out; 5666 memmove(file->pathname, ptr,
5456 5667 sizeof(file->pathname)-(ptr-file->pathname));
5457 ptr = d_path(&mddev->bitmap->storage.file->f_path, 5668 spin_unlock(&mddev->lock);
5458 buf, sizeof(file->pathname));
5459 if (IS_ERR(ptr))
5460 goto out;
5461
5462 strcpy(file->pathname, ptr);
5463 5669
5464copy_out: 5670 if (err == 0 &&
5465 err = 0; 5671 copy_to_user(arg, file, sizeof(*file)))
5466 if (copy_to_user(arg, file, sizeof(*file)))
5467 err = -EFAULT; 5672 err = -EFAULT;
5468out: 5673
5469 kfree(buf);
5470 kfree(file); 5674 kfree(file);
5471 return err; 5675 return err;
5472} 5676}
@@ -5789,22 +5993,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
5789 5993
5790 if (fd >= 0) { 5994 if (fd >= 0) {
5791 struct inode *inode; 5995 struct inode *inode;
5792 if (mddev->bitmap) 5996 struct file *f;
5997
5998 if (mddev->bitmap || mddev->bitmap_info.file)
5793 return -EEXIST; /* cannot add when bitmap is present */ 5999 return -EEXIST; /* cannot add when bitmap is present */
5794 mddev->bitmap_info.file = fget(fd); 6000 f = fget(fd);
5795 6001
5796 if (mddev->bitmap_info.file == NULL) { 6002 if (f == NULL) {
5797 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 6003 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5798 mdname(mddev)); 6004 mdname(mddev));
5799 return -EBADF; 6005 return -EBADF;
5800 } 6006 }
5801 6007
5802 inode = mddev->bitmap_info.file->f_mapping->host; 6008 inode = f->f_mapping->host;
5803 if (!S_ISREG(inode->i_mode)) { 6009 if (!S_ISREG(inode->i_mode)) {
5804 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", 6010 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
5805 mdname(mddev)); 6011 mdname(mddev));
5806 err = -EBADF; 6012 err = -EBADF;
5807 } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { 6013 } else if (!(f->f_mode & FMODE_WRITE)) {
5808 printk(KERN_ERR "%s: error: bitmap file must open for write\n", 6014 printk(KERN_ERR "%s: error: bitmap file must open for write\n",
5809 mdname(mddev)); 6015 mdname(mddev));
5810 err = -EBADF; 6016 err = -EBADF;
@@ -5814,10 +6020,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
5814 err = -EBUSY; 6020 err = -EBUSY;
5815 } 6021 }
5816 if (err) { 6022 if (err) {
5817 fput(mddev->bitmap_info.file); 6023 fput(f);
5818 mddev->bitmap_info.file = NULL;
5819 return err; 6024 return err;
5820 } 6025 }
6026 mddev->bitmap_info.file = f;
5821 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6027 mddev->bitmap_info.offset = 0; /* file overrides offset */
5822 } else if (mddev->bitmap == NULL) 6028 } else if (mddev->bitmap == NULL)
5823 return -ENOENT; /* cannot remove what isn't there */ 6029 return -ENOENT; /* cannot remove what isn't there */
@@ -5836,9 +6042,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
5836 mddev->pers->quiesce(mddev, 0); 6042 mddev->pers->quiesce(mddev, 0);
5837 } 6043 }
5838 if (fd < 0) { 6044 if (fd < 0) {
5839 if (mddev->bitmap_info.file) 6045 struct file *f = mddev->bitmap_info.file;
5840 fput(mddev->bitmap_info.file); 6046 if (f) {
5841 mddev->bitmap_info.file = NULL; 6047 spin_lock(&mddev->lock);
6048 mddev->bitmap_info.file = NULL;
6049 spin_unlock(&mddev->lock);
6050 fput(f);
6051 }
5842 } 6052 }
5843 6053
5844 return err; 6054 return err;
@@ -6251,6 +6461,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6251 case SET_DISK_FAULTY: 6461 case SET_DISK_FAULTY:
6252 err = set_disk_faulty(mddev, new_decode_dev(arg)); 6462 err = set_disk_faulty(mddev, new_decode_dev(arg));
6253 goto out; 6463 goto out;
6464
6465 case GET_BITMAP_FILE:
6466 err = get_bitmap_file(mddev, argp);
6467 goto out;
6468
6254 } 6469 }
6255 6470
6256 if (cmd == ADD_NEW_DISK) 6471 if (cmd == ADD_NEW_DISK)
@@ -6342,10 +6557,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6342 * Commands even a read-only array can execute: 6557 * Commands even a read-only array can execute:
6343 */ 6558 */
6344 switch (cmd) { 6559 switch (cmd) {
6345 case GET_BITMAP_FILE:
6346 err = get_bitmap_file(mddev, argp);
6347 goto unlock;
6348
6349 case RESTART_ARRAY_RW: 6560 case RESTART_ARRAY_RW:
6350 err = restart_array(mddev); 6561 err = restart_array(mddev);
6351 goto unlock; 6562 goto unlock;
@@ -6873,9 +7084,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6873 return 0; 7084 return 0;
6874 } 7085 }
6875 7086
6876 if (mddev_lock(mddev) < 0) 7087 spin_lock(&mddev->lock);
6877 return -EINTR;
6878
6879 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7088 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
6880 seq_printf(seq, "%s : %sactive", mdname(mddev), 7089 seq_printf(seq, "%s : %sactive", mdname(mddev),
6881 mddev->pers ? "" : "in"); 7090 mddev->pers ? "" : "in");
@@ -6888,7 +7097,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
6888 } 7097 }
6889 7098
6890 sectors = 0; 7099 sectors = 0;
6891 rdev_for_each(rdev, mddev) { 7100 rcu_read_lock();
7101 rdev_for_each_rcu(rdev, mddev) {
6892 char b[BDEVNAME_SIZE]; 7102 char b[BDEVNAME_SIZE];
6893 seq_printf(seq, " %s[%d]", 7103 seq_printf(seq, " %s[%d]",
6894 bdevname(rdev->bdev,b), rdev->desc_nr); 7104 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6904,6 +7114,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6904 seq_printf(seq, "(R)"); 7114 seq_printf(seq, "(R)");
6905 sectors += rdev->sectors; 7115 sectors += rdev->sectors;
6906 } 7116 }
7117 rcu_read_unlock();
6907 7118
6908 if (!list_empty(&mddev->disks)) { 7119 if (!list_empty(&mddev->disks)) {
6909 if (mddev->pers) 7120 if (mddev->pers)
@@ -6946,7 +7157,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6946 7157
6947 seq_printf(seq, "\n"); 7158 seq_printf(seq, "\n");
6948 } 7159 }
6949 mddev_unlock(mddev); 7160 spin_unlock(&mddev->lock);
6950 7161
6951 return 0; 7162 return 0;
6952} 7163}
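Note: md_seq_show() now relies on mddev->lock plus RCU for the rdev walk, so reading /proc/mdstat never waits on the reconfig mutex. A condensed sketch of that read-side pattern; the helper name is illustrative.

	static void mdstat_devices_sketch(struct seq_file *seq, struct mddev *mddev)
	{
		struct md_rdev *rdev;
		char b[BDEVNAME_SIZE];

		spin_lock(&mddev->lock);
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			seq_printf(seq, " %s[%d]",
				   bdevname(rdev->bdev, b), rdev->desc_nr);
		rcu_read_unlock();
		spin_unlock(&mddev->lock);
	}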
@@ -7102,7 +7313,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
7102 if (mddev->safemode == 1) 7313 if (mddev->safemode == 1)
7103 mddev->safemode = 0; 7314 mddev->safemode = 0;
7104 if (mddev->in_sync) { 7315 if (mddev->in_sync) {
7105 spin_lock_irq(&mddev->write_lock); 7316 spin_lock(&mddev->lock);
7106 if (mddev->in_sync) { 7317 if (mddev->in_sync) {
7107 mddev->in_sync = 0; 7318 mddev->in_sync = 0;
7108 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7319 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7110,7 +7321,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
7110 md_wakeup_thread(mddev->thread); 7321 md_wakeup_thread(mddev->thread);
7111 did_change = 1; 7322 did_change = 1;
7112 } 7323 }
7113 spin_unlock_irq(&mddev->write_lock); 7324 spin_unlock(&mddev->lock);
7114 } 7325 }
7115 if (did_change) 7326 if (did_change)
7116 sysfs_notify_dirent_safe(mddev->sysfs_state); 7327 sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -7148,7 +7359,7 @@ int md_allow_write(struct mddev *mddev)
7148 if (!mddev->pers->sync_request) 7359 if (!mddev->pers->sync_request)
7149 return 0; 7360 return 0;
7150 7361
7151 spin_lock_irq(&mddev->write_lock); 7362 spin_lock(&mddev->lock);
7152 if (mddev->in_sync) { 7363 if (mddev->in_sync) {
7153 mddev->in_sync = 0; 7364 mddev->in_sync = 0;
7154 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7365 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7156,11 +7367,11 @@ int md_allow_write(struct mddev *mddev)
7156 if (mddev->safemode_delay && 7367 if (mddev->safemode_delay &&
7157 mddev->safemode == 0) 7368 mddev->safemode == 0)
7158 mddev->safemode = 1; 7369 mddev->safemode = 1;
7159 spin_unlock_irq(&mddev->write_lock); 7370 spin_unlock(&mddev->lock);
7160 md_update_sb(mddev, 0); 7371 md_update_sb(mddev, 0);
7161 sysfs_notify_dirent_safe(mddev->sysfs_state); 7372 sysfs_notify_dirent_safe(mddev->sysfs_state);
7162 } else 7373 } else
7163 spin_unlock_irq(&mddev->write_lock); 7374 spin_unlock(&mddev->lock);
7164 7375
7165 if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 7376 if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7166 return -EAGAIN; 7377 return -EAGAIN;
@@ -7513,6 +7724,7 @@ void md_do_sync(struct md_thread *thread)
7513 skip: 7724 skip:
7514 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7725 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7515 7726
7727 spin_lock(&mddev->lock);
7516 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7728 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7517 /* We completed so min/max setting can be forgotten if used. */ 7729 /* We completed so min/max setting can be forgotten if used. */
7518 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7730 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7521,6 +7733,8 @@ void md_do_sync(struct md_thread *thread)
7521 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7733 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7522 mddev->resync_min = mddev->curr_resync_completed; 7734 mddev->resync_min = mddev->curr_resync_completed;
7523 mddev->curr_resync = 0; 7735 mddev->curr_resync = 0;
7736 spin_unlock(&mddev->lock);
7737
7524 wake_up(&resync_wait); 7738 wake_up(&resync_wait);
7525 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7739 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7526 md_wakeup_thread(mddev->thread); 7740 md_wakeup_thread(mddev->thread);
@@ -7688,7 +7902,7 @@ void md_check_recovery(struct mddev *mddev)
7688 7902
7689 if (!mddev->external) { 7903 if (!mddev->external) {
7690 int did_change = 0; 7904 int did_change = 0;
7691 spin_lock_irq(&mddev->write_lock); 7905 spin_lock(&mddev->lock);
7692 if (mddev->safemode && 7906 if (mddev->safemode &&
7693 !atomic_read(&mddev->writes_pending) && 7907 !atomic_read(&mddev->writes_pending) &&
7694 !mddev->in_sync && 7908 !mddev->in_sync &&
@@ -7699,7 +7913,7 @@ void md_check_recovery(struct mddev *mddev)
7699 } 7913 }
7700 if (mddev->safemode == 1) 7914 if (mddev->safemode == 1)
7701 mddev->safemode = 0; 7915 mddev->safemode = 0;
7702 spin_unlock_irq(&mddev->write_lock); 7916 spin_unlock(&mddev->lock);
7703 if (did_change) 7917 if (did_change)
7704 sysfs_notify_dirent_safe(mddev->sysfs_state); 7918 sysfs_notify_dirent_safe(mddev->sysfs_state);
7705 } 7919 }
@@ -7721,7 +7935,9 @@ void md_check_recovery(struct mddev *mddev)
7721 * any transients in the value of "sync_action". 7935 * any transients in the value of "sync_action".
7722 */ 7936 */
7723 mddev->curr_resync_completed = 0; 7937 mddev->curr_resync_completed = 0;
7938 spin_lock(&mddev->lock);
7724 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7939 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7940 spin_unlock(&mddev->lock);
7725 /* Clear some bits that don't mean anything, but 7941 /* Clear some bits that don't mean anything, but
7726 * might be left set 7942 * might be left set
7727 */ 7943 */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 03cec5bdcaae..318ca8fd430f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -386,7 +386,18 @@ struct mddev {
386 386
387 struct work_struct del_work; /* used for delayed sysfs removal */ 387 struct work_struct del_work; /* used for delayed sysfs removal */
388 388
389 spinlock_t write_lock; 389 /* "lock" protects:
390 * flush_bio transition from NULL to !NULL
391 * rdev superblocks, events
392 * clearing MD_CHANGE_*
393 * in_sync - and related safemode and MD_CHANGE changes
394 * pers (also protected by reconfig_mutex and pending IO).
395 * clearing ->bitmap
396 * clearing ->bitmap_info.file
397 * changing ->resync_{min,max}
398 * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
399 */
400 spinlock_t lock;
390 wait_queue_head_t sb_wait; /* for waiting on superblock updates */ 401 wait_queue_head_t sb_wait; /* for waiting on superblock updates */
391 atomic_t pending_writes; /* number of active superblock writes */ 402 atomic_t pending_writes; /* number of active superblock writes */
392 403
@@ -439,13 +450,30 @@ struct mddev {
439 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 450 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
440}; 451};
441 452
442static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) 453static inline int __must_check mddev_lock(struct mddev *mddev)
443{ 454{
444 int faulty = test_bit(Faulty, &rdev->flags); 455 return mutex_lock_interruptible(&mddev->reconfig_mutex);
445 if (atomic_dec_and_test(&rdev->nr_pending) && faulty) 456}
446 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 457
458/* Sometimes we need to take the lock in a situation where
459 * failure due to interrupts is not acceptable.
460 */
461static inline void mddev_lock_nointr(struct mddev *mddev)
462{
463 mutex_lock(&mddev->reconfig_mutex);
464}
465
466static inline int mddev_is_locked(struct mddev *mddev)
467{
468 return mutex_is_locked(&mddev->reconfig_mutex);
447} 469}
448 470
471static inline int mddev_trylock(struct mddev *mddev)
472{
473 return mutex_trylock(&mddev->reconfig_mutex);
474}
475extern void mddev_unlock(struct mddev *mddev);
476
449static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) 477static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
450{ 478{
451 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); 479 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
@@ -459,7 +487,7 @@ struct md_personality
459 struct module *owner; 487 struct module *owner;
460 void (*make_request)(struct mddev *mddev, struct bio *bio); 488 void (*make_request)(struct mddev *mddev, struct bio *bio);
461 int (*run)(struct mddev *mddev); 489 int (*run)(struct mddev *mddev);
462 int (*stop)(struct mddev *mddev); 490 void (*free)(struct mddev *mddev, void *priv);
463 void (*status)(struct seq_file *seq, struct mddev *mddev); 491 void (*status)(struct seq_file *seq, struct mddev *mddev);
464 /* error_handler must set ->faulty and clear ->in_sync 492 /* error_handler must set ->faulty and clear ->in_sync
465 * if appropriate, and should abort recovery if needed 493 * if appropriate, and should abort recovery if needed
@@ -490,6 +518,13 @@ struct md_personality
490 * array. 518 * array.
491 */ 519 */
492 void *(*takeover) (struct mddev *mddev); 520 void *(*takeover) (struct mddev *mddev);
521 /* congested implements bdi.congested_fn().
522 * Will not be called while array is 'suspended' */
523 int (*congested)(struct mddev *mddev, int bits);
 524 /* mergeable_bvec is used to implement ->merge_bvec_fn */

525 int (*mergeable_bvec)(struct mddev *mddev,
526 struct bvec_merge_data *bvm,
527 struct bio_vec *biovec);
493}; 528};
494 529
495struct md_sysfs_entry { 530struct md_sysfs_entry {
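Note: with ->free, ->congested and ->mergeable_bvec in struct md_personality, a personality only fills in its ops table and md core wires up the request_queue. A hedged sketch of a table under the new ops; the "demo" personality and its functions are hypothetical.

	static struct md_personality demo_personality = {
		.name		= "demo",
		.owner		= THIS_MODULE,
		.make_request	= demo_make_request,
		.run		= demo_run,
		.free		= demo_free,		/* replaces the old ->stop */
		.status		= demo_status,
		.size		= demo_size,
		.congested	= demo_congested,	/* called via the md-core wrapper */
		.mergeable_bvec	= demo_mergeable_bvec,	/* optional */
	};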
@@ -624,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev)
624 return !!blk_check_plugged(md_unplug, mddev, 659 return !!blk_check_plugged(md_unplug, mddev,
625 sizeof(struct blk_plug_cb)); 660 sizeof(struct blk_plug_cb));
626} 661}
662
663static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
664{
665 int faulty = test_bit(Faulty, &rdev->flags);
666 if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
667 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
668 md_wakeup_thread(mddev->thread);
669 }
670}
671
627#endif /* _MD_MD_H */ 672#endif /* _MD_MD_H */
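Note: the header now documents two distinct locks: the interruptible reconfig_mutex taken through mddev_lock() (hence __must_check), and the mddev->lock spinlock for short, non-sleeping accesses to the fields listed in the comment. An assumed usage sketch of both; demo_store and demo_show are hypothetical handlers.

	static ssize_t demo_store(struct mddev *mddev, const char *buf, size_t len)
	{
		int err = mddev_lock(mddev);

		if (err)
			return err;		/* typically -EINTR */
		/* ... modify configuration under reconfig_mutex ... */
		mddev_unlock(mddev);
		return len;
	}

	static ssize_t demo_show(struct mddev *mddev, char *page)
	{
		ssize_t ret;

		spin_lock(&mddev->lock);	/* short, non-sleeping read */
		ret = sprintf(page, "%d\n", mddev->pers != NULL);
		spin_unlock(&mddev->lock);
		return ret;
	}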
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 399272f9c042..ac3ede2bd00e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev)
153 seq_printf (seq, "]"); 153 seq_printf (seq, "]");
154} 154}
155 155
156static int multipath_congested(void *data, int bits) 156static int multipath_congested(struct mddev *mddev, int bits)
157{ 157{
158 struct mddev *mddev = data;
159 struct mpconf *conf = mddev->private; 158 struct mpconf *conf = mddev->private;
160 int i, ret = 0; 159 int i, ret = 0;
161 160
162 if (mddev_congested(mddev, bits))
163 return 1;
164
165 rcu_read_lock(); 161 rcu_read_lock();
166 for (i = 0; i < mddev->raid_disks ; i++) { 162 for (i = 0; i < mddev->raid_disks ; i++) {
167 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); 163 struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -403,7 +399,7 @@ static int multipath_run (struct mddev *mddev)
403 /* 399 /*
404 * copy the already verified devices into our private MULTIPATH 400 * copy the already verified devices into our private MULTIPATH
405 * bookkeeping area. [whatever we allocate in multipath_run(), 401 * bookkeeping area. [whatever we allocate in multipath_run(),
406 * should be freed in multipath_stop()] 402 * should be freed in multipath_free()]
407 */ 403 */
408 404
409 conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); 405 conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
@@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev)
489 */ 485 */
490 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); 486 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
491 487
492 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
493 mddev->queue->backing_dev_info.congested_data = mddev;
494
495 if (md_integrity_register(mddev)) 488 if (md_integrity_register(mddev))
496 goto out_free_conf; 489 goto out_free_conf;
497 490
@@ -507,17 +500,13 @@ out:
507 return -EIO; 500 return -EIO;
508} 501}
509 502
510static int multipath_stop (struct mddev *mddev) 503static void multipath_free(struct mddev *mddev, void *priv)
511{ 504{
512 struct mpconf *conf = mddev->private; 505 struct mpconf *conf = priv;
513 506
514 md_unregister_thread(&mddev->thread);
515 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
516 mempool_destroy(conf->pool); 507 mempool_destroy(conf->pool);
517 kfree(conf->multipaths); 508 kfree(conf->multipaths);
518 kfree(conf); 509 kfree(conf);
519 mddev->private = NULL;
520 return 0;
521} 510}
522 511
523static struct md_personality multipath_personality = 512static struct md_personality multipath_personality =
@@ -527,12 +516,13 @@ static struct md_personality multipath_personality =
527 .owner = THIS_MODULE, 516 .owner = THIS_MODULE,
528 .make_request = multipath_make_request, 517 .make_request = multipath_make_request,
529 .run = multipath_run, 518 .run = multipath_run,
530 .stop = multipath_stop, 519 .free = multipath_free,
531 .status = multipath_status, 520 .status = multipath_status,
532 .error_handler = multipath_error, 521 .error_handler = multipath_error,
533 .hot_add_disk = multipath_add_disk, 522 .hot_add_disk = multipath_add_disk,
534 .hot_remove_disk= multipath_remove_disk, 523 .hot_remove_disk= multipath_remove_disk,
535 .size = multipath_size, 524 .size = multipath_size,
525 .congested = multipath_congested,
536}; 526};
537 527
538static int __init multipath_init (void) 528static int __init multipath_init (void)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ba6b85de96d2..a13f738a7b39 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,17 +25,13 @@
25#include "raid0.h" 25#include "raid0.h"
26#include "raid5.h" 26#include "raid5.h"
27 27
28static int raid0_congested(void *data, int bits) 28static int raid0_congested(struct mddev *mddev, int bits)
29{ 29{
30 struct mddev *mddev = data;
31 struct r0conf *conf = mddev->private; 30 struct r0conf *conf = mddev->private;
32 struct md_rdev **devlist = conf->devlist; 31 struct md_rdev **devlist = conf->devlist;
33 int raid_disks = conf->strip_zone[0].nb_dev; 32 int raid_disks = conf->strip_zone[0].nb_dev;
34 int i, ret = 0; 33 int i, ret = 0;
35 34
36 if (mddev_congested(mddev, bits))
37 return 1;
38
39 for (i = 0; i < raid_disks && !ret ; i++) { 35 for (i = 0; i < raid_disks && !ret ; i++) {
40 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 36 struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
41 37
@@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
263 mdname(mddev), 259 mdname(mddev),
264 (unsigned long long)smallest->sectors); 260 (unsigned long long)smallest->sectors);
265 } 261 }
266 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
267 mddev->queue->backing_dev_info.congested_data = mddev;
268 262
269 /* 263 /*
270 * now since we have the hard sector sizes, we can make sure 264 * now since we have the hard sector sizes, we can make sure
@@ -356,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
356 350
357/** 351/**
358 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged 352 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
359 * @q: request queue 353 * @mddev: the md device
360 * @bvm: properties of new bio 354 * @bvm: properties of new bio
361 * @biovec: the request that could be merged to it. 355 * @biovec: the request that could be merged to it.
362 * 356 *
363 * Return amount of bytes we can accept at this offset 357 * Return amount of bytes we can accept at this offset
364 */ 358 */
365static int raid0_mergeable_bvec(struct request_queue *q, 359static int raid0_mergeable_bvec(struct mddev *mddev,
366 struct bvec_merge_data *bvm, 360 struct bvec_merge_data *bvm,
367 struct bio_vec *biovec) 361 struct bio_vec *biovec)
368{ 362{
369 struct mddev *mddev = q->queuedata;
370 struct r0conf *conf = mddev->private; 363 struct r0conf *conf = mddev->private;
371 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 364 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
372 sector_t sector_offset = sector; 365 sector_t sector_offset = sector;
@@ -422,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
422 return array_sectors; 415 return array_sectors;
423} 416}
424 417
425static int raid0_stop(struct mddev *mddev); 418static void raid0_free(struct mddev *mddev, void *priv);
426 419
427static int raid0_run(struct mddev *mddev) 420static int raid0_run(struct mddev *mddev)
428{ 421{
@@ -471,26 +464,22 @@ static int raid0_run(struct mddev *mddev)
471 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 464 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
472 } 465 }
473 466
474 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
475 dump_zones(mddev); 467 dump_zones(mddev);
476 468
477 ret = md_integrity_register(mddev); 469 ret = md_integrity_register(mddev);
478 if (ret) 470 if (ret)
479 raid0_stop(mddev); 471 raid0_free(mddev, conf);
480 472
481 return ret; 473 return ret;
482} 474}
483 475
484static int raid0_stop(struct mddev *mddev) 476static void raid0_free(struct mddev *mddev, void *priv)
485{ 477{
486 struct r0conf *conf = mddev->private; 478 struct r0conf *conf = priv;
487 479
488 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
489 kfree(conf->strip_zone); 480 kfree(conf->strip_zone);
490 kfree(conf->devlist); 481 kfree(conf->devlist);
491 kfree(conf); 482 kfree(conf);
492 mddev->private = NULL;
493 return 0;
494} 483}
495 484
496/* 485/*
@@ -724,11 +713,13 @@ static struct md_personality raid0_personality=
724 .owner = THIS_MODULE, 713 .owner = THIS_MODULE,
725 .make_request = raid0_make_request, 714 .make_request = raid0_make_request,
726 .run = raid0_run, 715 .run = raid0_run,
727 .stop = raid0_stop, 716 .free = raid0_free,
728 .status = raid0_status, 717 .status = raid0_status,
729 .size = raid0_size, 718 .size = raid0_size,
730 .takeover = raid0_takeover, 719 .takeover = raid0_takeover,
731 .quiesce = raid0_quiesce, 720 .quiesce = raid0_quiesce,
721 .congested = raid0_congested,
722 .mergeable_bvec = raid0_mergeable_bvec,
732}; 723};
733 724
734static int __init raid0_init (void) 725static int __init raid0_init (void)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 40b35be34f8d..5dd0c2e59ab9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -701,11 +701,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
701 return best_disk; 701 return best_disk;
702} 702}
703 703
704static int raid1_mergeable_bvec(struct request_queue *q, 704static int raid1_mergeable_bvec(struct mddev *mddev,
705 struct bvec_merge_data *bvm, 705 struct bvec_merge_data *bvm,
706 struct bio_vec *biovec) 706 struct bio_vec *biovec)
707{ 707{
708 struct mddev *mddev = q->queuedata;
709 struct r1conf *conf = mddev->private; 708 struct r1conf *conf = mddev->private;
710 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 709 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
711 int max = biovec->bv_len; 710 int max = biovec->bv_len;
@@ -734,7 +733,7 @@ static int raid1_mergeable_bvec(struct request_queue *q,
734 733
735} 734}
736 735
737int md_raid1_congested(struct mddev *mddev, int bits) 736static int raid1_congested(struct mddev *mddev, int bits)
738{ 737{
739 struct r1conf *conf = mddev->private; 738 struct r1conf *conf = mddev->private;
740 int i, ret = 0; 739 int i, ret = 0;
@@ -763,15 +762,6 @@ int md_raid1_congested(struct mddev *mddev, int bits)
763 rcu_read_unlock(); 762 rcu_read_unlock();
764 return ret; 763 return ret;
765} 764}
766EXPORT_SYMBOL_GPL(md_raid1_congested);
767
768static int raid1_congested(void *data, int bits)
769{
770 struct mddev *mddev = data;
771
772 return mddev_congested(mddev, bits) ||
773 md_raid1_congested(mddev, bits);
774}
775 765
776static void flush_pending_writes(struct r1conf *conf) 766static void flush_pending_writes(struct r1conf *conf)
777{ 767{
@@ -2882,7 +2872,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2882 return ERR_PTR(err); 2872 return ERR_PTR(err);
2883} 2873}
2884 2874
2885static int stop(struct mddev *mddev); 2875static void raid1_free(struct mddev *mddev, void *priv);
2886static int run(struct mddev *mddev) 2876static int run(struct mddev *mddev)
2887{ 2877{
2888 struct r1conf *conf; 2878 struct r1conf *conf;
@@ -2904,7 +2894,7 @@ static int run(struct mddev *mddev)
2904 /* 2894 /*
2905 * copy the already verified devices into our private RAID1 2895 * copy the already verified devices into our private RAID1
2906 * bookkeeping area. [whatever we allocate in run(), 2896 * bookkeeping area. [whatever we allocate in run(),
2907 * should be freed in stop()] 2897 * should be freed in raid1_free()]
2908 */ 2898 */
2909 if (mddev->private == NULL) 2899 if (mddev->private == NULL)
2910 conf = setup_conf(mddev); 2900 conf = setup_conf(mddev);
@@ -2955,10 +2945,6 @@ static int run(struct mddev *mddev)
2955 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 2945 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2956 2946
2957 if (mddev->queue) { 2947 if (mddev->queue) {
2958 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2959 mddev->queue->backing_dev_info.congested_data = mddev;
2960 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2961
2962 if (discard_supported) 2948 if (discard_supported)
2963 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 2949 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
2964 mddev->queue); 2950 mddev->queue);
@@ -2968,37 +2954,23 @@ static int run(struct mddev *mddev)
2968 } 2954 }
2969 2955
2970 ret = md_integrity_register(mddev); 2956 ret = md_integrity_register(mddev);
2971 if (ret) 2957 if (ret) {
2972 stop(mddev); 2958 md_unregister_thread(&mddev->thread);
2959 raid1_free(mddev, conf);
2960 }
2973 return ret; 2961 return ret;
2974} 2962}
2975 2963
2976static int stop(struct mddev *mddev) 2964static void raid1_free(struct mddev *mddev, void *priv)
2977{ 2965{
2978 struct r1conf *conf = mddev->private; 2966 struct r1conf *conf = priv;
2979 struct bitmap *bitmap = mddev->bitmap;
2980 2967
2981 /* wait for behind writes to complete */
2982 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2983 printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
2984 mdname(mddev));
2985 /* need to kick something here to make sure I/O goes? */
2986 wait_event(bitmap->behind_wait,
2987 atomic_read(&bitmap->behind_writes) == 0);
2988 }
2989
2990 freeze_array(conf, 0);
2991 unfreeze_array(conf);
2992
2993 md_unregister_thread(&mddev->thread);
2994 if (conf->r1bio_pool) 2968 if (conf->r1bio_pool)
2995 mempool_destroy(conf->r1bio_pool); 2969 mempool_destroy(conf->r1bio_pool);
2996 kfree(conf->mirrors); 2970 kfree(conf->mirrors);
2997 safe_put_page(conf->tmppage); 2971 safe_put_page(conf->tmppage);
2998 kfree(conf->poolinfo); 2972 kfree(conf->poolinfo);
2999 kfree(conf); 2973 kfree(conf);
3000 mddev->private = NULL;
3001 return 0;
3002} 2974}
3003 2975
3004static int raid1_resize(struct mddev *mddev, sector_t sectors) 2976static int raid1_resize(struct mddev *mddev, sector_t sectors)
@@ -3181,7 +3153,7 @@ static struct md_personality raid1_personality =
3181 .owner = THIS_MODULE, 3153 .owner = THIS_MODULE,
3182 .make_request = make_request, 3154 .make_request = make_request,
3183 .run = run, 3155 .run = run,
3184 .stop = stop, 3156 .free = raid1_free,
3185 .status = status, 3157 .status = status,
3186 .error_handler = error, 3158 .error_handler = error,
3187 .hot_add_disk = raid1_add_disk, 3159 .hot_add_disk = raid1_add_disk,
@@ -3193,6 +3165,8 @@ static struct md_personality raid1_personality =
3193 .check_reshape = raid1_reshape, 3165 .check_reshape = raid1_reshape,
3194 .quiesce = raid1_quiesce, 3166 .quiesce = raid1_quiesce,
3195 .takeover = raid1_takeover, 3167 .takeover = raid1_takeover,
3168 .congested = raid1_congested,
3169 .mergeable_bvec = raid1_mergeable_bvec,
3196}; 3170};
3197 3171
3198static int __init raid_init(void) 3172static int __init raid_init(void)
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 33bda55ef9f7..14ebb288c1ef 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -170,7 +170,4 @@ struct r1bio {
170 */ 170 */
171#define R1BIO_MadeGood 7 171#define R1BIO_MadeGood 7
172#define R1BIO_WriteError 8 172#define R1BIO_WriteError 8
173
174extern int md_raid1_congested(struct mddev *mddev, int bits);
175
176#endif 173#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 32e282f4c83c..b8d76b1fba64 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
674 674
675/** 675/**
 676 * raid10_mergeable_bvec -- tell bio layer if two requests can be merged 676 * raid10_mergeable_bvec -- tell bio layer if two requests can be merged
677 * @q: request queue 677 * @mddev: the md device
678 * @bvm: properties of new bio 678 * @bvm: properties of new bio
679 * @biovec: the request that could be merged to it. 679 * @biovec: the request that could be merged to it.
680 * 680 *
@@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
682 * This requires checking for end-of-chunk if near_copies != raid_disks, 682 * This requires checking for end-of-chunk if near_copies != raid_disks,
683 * and for subordinate merge_bvec_fns if merge_check_needed. 683 * and for subordinate merge_bvec_fns if merge_check_needed.
684 */ 684 */
685static int raid10_mergeable_bvec(struct request_queue *q, 685static int raid10_mergeable_bvec(struct mddev *mddev,
686 struct bvec_merge_data *bvm, 686 struct bvec_merge_data *bvm,
687 struct bio_vec *biovec) 687 struct bio_vec *biovec)
688{ 688{
689 struct mddev *mddev = q->queuedata;
690 struct r10conf *conf = mddev->private; 689 struct r10conf *conf = mddev->private;
691 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 690 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
692 int max; 691 int max;
@@ -910,7 +909,7 @@ retry:
910 return rdev; 909 return rdev;
911} 910}
912 911
913int md_raid10_congested(struct mddev *mddev, int bits) 912static int raid10_congested(struct mddev *mddev, int bits)
914{ 913{
915 struct r10conf *conf = mddev->private; 914 struct r10conf *conf = mddev->private;
916 int i, ret = 0; 915 int i, ret = 0;
@@ -934,15 +933,6 @@ int md_raid10_congested(struct mddev *mddev, int bits)
934 rcu_read_unlock(); 933 rcu_read_unlock();
935 return ret; 934 return ret;
936} 935}
937EXPORT_SYMBOL_GPL(md_raid10_congested);
938
939static int raid10_congested(void *data, int bits)
940{
941 struct mddev *mddev = data;
942
943 return mddev_congested(mddev, bits) ||
944 md_raid10_congested(mddev, bits);
945}
946 936
947static void flush_pending_writes(struct r10conf *conf) 937static void flush_pending_writes(struct r10conf *conf)
948{ 938{
@@ -3757,8 +3747,6 @@ static int run(struct mddev *mddev)
3757 if (mddev->queue) { 3747 if (mddev->queue) {
3758 int stripe = conf->geo.raid_disks * 3748 int stripe = conf->geo.raid_disks *
3759 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3749 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3760 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3761 mddev->queue->backing_dev_info.congested_data = mddev;
3762 3750
3763 /* Calculate max read-ahead size. 3751 /* Calculate max read-ahead size.
3764 * We need to readahead at least twice a whole stripe.... 3752 * We need to readahead at least twice a whole stripe....
@@ -3767,7 +3755,6 @@ static int run(struct mddev *mddev)
3767 stripe /= conf->geo.near_copies; 3755 stripe /= conf->geo.near_copies;
3768 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3756 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3769 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3757 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3770 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3771 } 3758 }
3772 3759
3773 if (md_integrity_register(mddev)) 3760 if (md_integrity_register(mddev))
@@ -3811,17 +3798,9 @@ out:
3811 return -EIO; 3798 return -EIO;
3812} 3799}
3813 3800
3814static int stop(struct mddev *mddev) 3801static void raid10_free(struct mddev *mddev, void *priv)
3815{ 3802{
3816 struct r10conf *conf = mddev->private; 3803 struct r10conf *conf = priv;
3817
3818 raise_barrier(conf, 0);
3819 lower_barrier(conf);
3820
3821 md_unregister_thread(&mddev->thread);
3822 if (mddev->queue)
3823 /* the unplug fn references 'conf'*/
3824 blk_sync_queue(mddev->queue);
3825 3804
3826 if (conf->r10bio_pool) 3805 if (conf->r10bio_pool)
3827 mempool_destroy(conf->r10bio_pool); 3806 mempool_destroy(conf->r10bio_pool);
@@ -3830,8 +3809,6 @@ static int stop(struct mddev *mddev)
3830 kfree(conf->mirrors_old); 3809 kfree(conf->mirrors_old);
3831 kfree(conf->mirrors_new); 3810 kfree(conf->mirrors_new);
3832 kfree(conf); 3811 kfree(conf);
3833 mddev->private = NULL;
3834 return 0;
3835} 3812}
3836 3813
3837static void raid10_quiesce(struct mddev *mddev, int state) 3814static void raid10_quiesce(struct mddev *mddev, int state)
@@ -3895,7 +3872,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
3895 return 0; 3872 return 0;
3896} 3873}
3897 3874
3898static void *raid10_takeover_raid0(struct mddev *mddev) 3875static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
3899{ 3876{
3900 struct md_rdev *rdev; 3877 struct md_rdev *rdev;
3901 struct r10conf *conf; 3878 struct r10conf *conf;
@@ -3905,6 +3882,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3905 mdname(mddev)); 3882 mdname(mddev));
3906 return ERR_PTR(-EINVAL); 3883 return ERR_PTR(-EINVAL);
3907 } 3884 }
3885 sector_div(size, devs);
3908 3886
3909 /* Set new parameters */ 3887 /* Set new parameters */
3910 mddev->new_level = 10; 3888 mddev->new_level = 10;
@@ -3915,12 +3893,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3915 mddev->raid_disks *= 2; 3893 mddev->raid_disks *= 2;
3916 /* make sure it will be not marked as dirty */ 3894 /* make sure it will be not marked as dirty */
3917 mddev->recovery_cp = MaxSector; 3895 mddev->recovery_cp = MaxSector;
3896 mddev->dev_sectors = size;
3918 3897
3919 conf = setup_conf(mddev); 3898 conf = setup_conf(mddev);
3920 if (!IS_ERR(conf)) { 3899 if (!IS_ERR(conf)) {
3921 rdev_for_each(rdev, mddev) 3900 rdev_for_each(rdev, mddev)
3922 if (rdev->raid_disk >= 0) 3901 if (rdev->raid_disk >= 0) {
3923 rdev->new_raid_disk = rdev->raid_disk * 2; 3902 rdev->new_raid_disk = rdev->raid_disk * 2;
3903 rdev->sectors = size;
3904 }
3924 conf->barrier = 1; 3905 conf->barrier = 1;
3925 } 3906 }
3926 3907
@@ -3943,7 +3924,9 @@ static void *raid10_takeover(struct mddev *mddev)
3943 mdname(mddev)); 3924 mdname(mddev));
3944 return ERR_PTR(-EINVAL); 3925 return ERR_PTR(-EINVAL);
3945 } 3926 }
3946 return raid10_takeover_raid0(mddev); 3927 return raid10_takeover_raid0(mddev,
3928 raid0_conf->strip_zone->zone_end,
3929 raid0_conf->strip_zone->nb_dev);
3947 } 3930 }
3948 return ERR_PTR(-EINVAL); 3931 return ERR_PTR(-EINVAL);
3949} 3932}
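Note: raid10_takeover_raid0() now receives the RAID0 zone end and device count so it can derive the per-device size the RAID10 layout needs; previously dev_sectors was left at the RAID0 total, which broke the conversion. An illustrative calculation with invented numbers:

	/* Illustration only: a 2-disk RAID0 whose single strip zone ends at
	 * 2097152 array sectors passes size = 2097152, devs = 2.
	 */
	sector_t size = raid0_conf->strip_zone->zone_end;	/* 2097152 */
	int devs = raid0_conf->strip_zone->nb_dev;		/* 2 */

	sector_div(size, devs);		/* size == 1048576 per device */
	mddev->dev_sectors = size;	/* and each rdev->sectors likewise */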
@@ -4713,7 +4696,7 @@ static struct md_personality raid10_personality =
4713 .owner = THIS_MODULE, 4696 .owner = THIS_MODULE,
4714 .make_request = make_request, 4697 .make_request = make_request,
4715 .run = run, 4698 .run = run,
4716 .stop = stop, 4699 .free = raid10_free,
4717 .status = status, 4700 .status = status,
4718 .error_handler = error, 4701 .error_handler = error,
4719 .hot_add_disk = raid10_add_disk, 4702 .hot_add_disk = raid10_add_disk,
@@ -4727,6 +4710,8 @@ static struct md_personality raid10_personality =
4727 .check_reshape = raid10_check_reshape, 4710 .check_reshape = raid10_check_reshape,
4728 .start_reshape = raid10_start_reshape, 4711 .start_reshape = raid10_start_reshape,
4729 .finish_reshape = raid10_finish_reshape, 4712 .finish_reshape = raid10_finish_reshape,
4713 .congested = raid10_congested,
4714 .mergeable_bvec = raid10_mergeable_bvec,
4730}; 4715};
4731 4716
4732static int __init raid_init(void) 4717static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 157d69e83ff4..5ee6473ddc2c 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -150,7 +150,4 @@ enum r10bio_state {
150 */ 150 */
151 R10BIO_Previous, 151 R10BIO_Previous,
152}; 152};
153
154extern int md_raid10_congested(struct mddev *mddev, int bits);
155
156#endif 153#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b98765f6f77f..aa76865b804b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
296 BUG_ON(atomic_read(&conf->active_stripes)==0); 296 BUG_ON(atomic_read(&conf->active_stripes)==0);
297 if (test_bit(STRIPE_HANDLE, &sh->state)) { 297 if (test_bit(STRIPE_HANDLE, &sh->state)) {
298 if (test_bit(STRIPE_DELAYED, &sh->state) && 298 if (test_bit(STRIPE_DELAYED, &sh->state) &&
299 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 299 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
300 list_add_tail(&sh->lru, &conf->delayed_list); 300 list_add_tail(&sh->lru, &conf->delayed_list);
301 if (atomic_read(&conf->preread_active_stripes) 301 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
302 < IO_THRESHOLD)
303 md_wakeup_thread(conf->mddev->thread);
304 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
305 sh->bm_seq - conf->seq_write > 0) 302 sh->bm_seq - conf->seq_write > 0)
306 list_add_tail(&sh->lru, &conf->bitmap_list); 303 list_add_tail(&sh->lru, &conf->bitmap_list);
307 else { 304 else {
@@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
2898 * Returns 1 when no more member devices need to be checked, otherwise returns 2895 * Returns 1 when no more member devices need to be checked, otherwise returns
2899 * 0 to tell the loop in handle_stripe_fill to continue 2896 * 0 to tell the loop in handle_stripe_fill to continue
2900 */ 2897 */
2901static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2898
2902 int disk_idx, int disks) 2899static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
2900 int disk_idx, int disks)
2903{ 2901{
2904 struct r5dev *dev = &sh->dev[disk_idx]; 2902 struct r5dev *dev = &sh->dev[disk_idx];
2905 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2903 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2906 &sh->dev[s->failed_num[1]] }; 2904 &sh->dev[s->failed_num[1]] };
2905 int i;
2906
2907
2908 if (test_bit(R5_LOCKED, &dev->flags) ||
2909 test_bit(R5_UPTODATE, &dev->flags))
2910 /* No point reading this as we already have it or have
2911 * decided to get it.
2912 */
2913 return 0;
2914
2915 if (dev->toread ||
2916 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
2917 /* We need this block to directly satisfy a request */
2918 return 1;
2919
2920 if (s->syncing || s->expanding ||
2921 (s->replacing && want_replace(sh, disk_idx)))
 2922 /* When syncing or expanding, we read everything.
2923 * When replacing, we need the replaced block.
2924 */
2925 return 1;
2926
2927 if ((s->failed >= 1 && fdev[0]->toread) ||
2928 (s->failed >= 2 && fdev[1]->toread))
2929 /* If we want to read from a failed device, then
2930 * we need to actually read every other device.
2931 */
2932 return 1;
2933
2934 /* Sometimes neither read-modify-write nor reconstruct-write
2935 * cycles can work. In those cases we read every block we
2936 * can. Then the parity-update is certain to have enough to
2937 * work with.
2938 * This can only be a problem when we need to write something,
2939 * and some device has failed. If either of those tests
 2940 	 * fails, we need look no further.
2941 */
2942 if (!s->failed || !s->to_write)
2943 return 0;
2944
2945 if (test_bit(R5_Insync, &dev->flags) &&
2946 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 2947 	/* Pre-reads are not permitted until after a short delay
 2948 	 * to gather multiple requests.  However, if this
 2949 	 * device is not Insync, the block could only be computed
 2950 	 * and there is no need to delay that.
2951 */
2952 return 0;
2953
2954 for (i = 0; i < s->failed; i++) {
2955 if (fdev[i]->towrite &&
2956 !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
2957 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
2958 /* If we have a partial write to a failed
2959 * device, then we will need to reconstruct
2960 * the content of that device, so all other
2961 * devices must be read.
2962 */
2963 return 1;
2964 }
2965
2966 /* If we are forced to do a reconstruct-write, either because
2967 * the current RAID6 implementation only supports that, or
 2968 	 * because parity cannot be trusted and we are currently
2969 * recovering it, there is extra need to be careful.
2970 * If one of the devices that we would need to read, because
2971 * it is not being overwritten (and maybe not written at all)
2972 * is missing/faulty, then we need to read everything we can.
2973 */
2974 if (sh->raid_conf->level != 6 &&
2975 sh->sector < sh->raid_conf->mddev->recovery_cp)
2976 /* reconstruct-write isn't being forced */
2977 return 0;
2978 for (i = 0; i < s->failed; i++) {
2979 if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
2980 !test_bit(R5_OVERWRITE, &fdev[i]->flags))
2981 return 1;
2982 }
2983
2984 return 0;
2985}
2986
2987static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2988 int disk_idx, int disks)
2989{
2990 struct r5dev *dev = &sh->dev[disk_idx];
2907 2991
2908 /* is the data in this block needed, and can we get it? */ 2992 /* is the data in this block needed, and can we get it? */
2909 if (!test_bit(R5_LOCKED, &dev->flags) && 2993 if (need_this_block(sh, s, disk_idx, disks)) {
2910 !test_bit(R5_UPTODATE, &dev->flags) &&
2911 (dev->toread ||
2912 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2913 s->syncing || s->expanding ||
2914 (s->replacing && want_replace(sh, disk_idx)) ||
2915 (s->failed >= 1 && fdev[0]->toread) ||
2916 (s->failed >= 2 && fdev[1]->toread) ||
2917 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2918 (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
2919 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2920 ((sh->raid_conf->level == 6 ||
2921 sh->sector >= sh->raid_conf->mddev->recovery_cp)
2922 && s->failed && s->to_write &&
2923 (s->to_write - s->non_overwrite <
2924 sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) &&
2925 (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
2926 /* we would like to get this block, possibly by computing it, 2994 /* we would like to get this block, possibly by computing it,
2927 * otherwise read it if the backing disk is insync 2995 * otherwise read it if the backing disk is insync
2928 */ 2996 */
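For context on the return-value contract described in the comment above ("no more member devices need to be checked"), a minimal sketch of the consuming loop in handle_stripe_fill, shown here for illustration rather than as part of this hunk:

/* Illustrative sketch: walk every member device and stop as soon as
 * fetch_block() reports that nothing further needs reading.
 */
static void handle_stripe_fill(struct stripe_head *sh,
			       struct stripe_head_state *s,
			       int disks)
{
	int i;

	/* Skip if a compute is already in flight or the stripe
	 * contents are in the midst of changing due to a write.
	 */
	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
	    !sh->reconstruct_state)
		for (i = disks; i--; )
			if (fetch_block(sh, s, i, disks))
				break;	/* nothing more to fetch */
	set_bit(STRIPE_HANDLE, &sh->state);
}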
@@ -4081,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf,
4081 } 4149 }
4082} 4150}
4083 4151
4084int md_raid5_congested(struct mddev *mddev, int bits) 4152static int raid5_congested(struct mddev *mddev, int bits)
4085{ 4153{
4086 struct r5conf *conf = mddev->private; 4154 struct r5conf *conf = mddev->private;
4087 4155
@@ -4098,24 +4166,14 @@ int md_raid5_congested(struct mddev *mddev, int bits)
4098 4166
4099 return 0; 4167 return 0;
4100} 4168}
4101EXPORT_SYMBOL_GPL(md_raid5_congested);
4102
4103static int raid5_congested(void *data, int bits)
4104{
4105 struct mddev *mddev = data;
4106
4107 return mddev_congested(mddev, bits) ||
4108 md_raid5_congested(mddev, bits);
4109}
4110 4169
4111/* We want read requests to align with chunks where possible, 4170/* We want read requests to align with chunks where possible,
4112 * but write requests don't need to. 4171 * but write requests don't need to.
4113 */ 4172 */
4114static int raid5_mergeable_bvec(struct request_queue *q, 4173static int raid5_mergeable_bvec(struct mddev *mddev,
4115 struct bvec_merge_data *bvm, 4174 struct bvec_merge_data *bvm,
4116 struct bio_vec *biovec) 4175 struct bio_vec *biovec)
4117{ 4176{
4118 struct mddev *mddev = q->queuedata;
4119 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 4177 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
4120 int max; 4178 int max;
4121 unsigned int chunk_sectors = mddev->chunk_sectors; 4179 unsigned int chunk_sectors = mddev->chunk_sectors;
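With EXPORT_SYMBOL_GPL(md_raid5_congested) and the queuedata wrapper gone, congestion queries are expected to reach raid5_congested() through the new ->congested personality method. A hedged sketch of that core-side dispatch follows; the helper name md_congested and its exact body are assumptions, not shown in this diff:

/* Sketch: the md core keeps ownership of backing_dev_info.congested_fn
 * and forwards to whichever personality is currently installed, so a
 * level change cannot leave a stale callback pointing into raid5.ko.
 */
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	struct md_personality *pers = mddev->pers;

	/* The real core code also synchronizes against concurrent
	 * personality changes and treats a suspended array as
	 * congested; both are omitted from this sketch.
	 */
	if (pers && pers->congested)
		return pers->congested(mddev, bits);
	return 0;
}

The counterpart is that core code (not shown here) installs congested_fn and merge_bvec_fn once, pointing at dispatchers like this, which is why the run() hunk further down drops the per-personality blk_queue_merge_bvec() and congested_fn assignments.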
@@ -5296,11 +5354,14 @@ static void raid5d(struct md_thread *thread)
5296static ssize_t 5354static ssize_t
5297raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 5355raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
5298{ 5356{
5299 struct r5conf *conf = mddev->private; 5357 struct r5conf *conf;
5358 int ret = 0;
5359 spin_lock(&mddev->lock);
5360 conf = mddev->private;
5300 if (conf) 5361 if (conf)
5301 return sprintf(page, "%d\n", conf->max_nr_stripes); 5362 ret = sprintf(page, "%d\n", conf->max_nr_stripes);
5302 else 5363 spin_unlock(&mddev->lock);
5303 return 0; 5364 return ret;
5304} 5365}
5305 5366
5306int 5367int
@@ -5339,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size);
5339static ssize_t 5400static ssize_t
5340raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 5401raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
5341{ 5402{
5342 struct r5conf *conf = mddev->private; 5403 struct r5conf *conf;
5343 unsigned long new; 5404 unsigned long new;
5344 int err; 5405 int err;
5345 5406
5346 if (len >= PAGE_SIZE) 5407 if (len >= PAGE_SIZE)
5347 return -EINVAL; 5408 return -EINVAL;
5348 if (!conf)
5349 return -ENODEV;
5350
5351 if (kstrtoul(page, 10, &new)) 5409 if (kstrtoul(page, 10, &new))
5352 return -EINVAL; 5410 return -EINVAL;
5353 err = raid5_set_cache_size(mddev, new); 5411 err = mddev_lock(mddev);
5354 if (err) 5412 if (err)
5355 return err; 5413 return err;
5356 return len; 5414 conf = mddev->private;
5415 if (!conf)
5416 err = -ENODEV;
5417 else
5418 err = raid5_set_cache_size(mddev, new);
5419 mddev_unlock(mddev);
5420
5421 return err ?: len;
5357} 5422}
5358 5423
5359static struct md_sysfs_entry 5424static struct md_sysfs_entry
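The preread_bypass_threshold, skip_copy and group_thread_cnt stores below follow the same shape; distilled as a template with a placeholder attribute update (illustrative only, not code from this diff):

/* Template of the reworked _store helpers: parse the input first, take
 * the reconfig mutex via mddev_lock(), re-sample mddev->private because
 * a concurrent level change may have freed it, apply the update, and
 * return either the error or the full input length.
 */
static ssize_t raid5_store_example(struct mddev *mddev,
				   const char *page, size_t len)
{
	struct r5conf *conf;
	unsigned long new;
	int err;

	if (len >= PAGE_SIZE)
		return -EINVAL;
	if (kstrtoul(page, 10, &new))
		return -EINVAL;

	err = mddev_lock(mddev);	/* interruptible; may return -EINTR */
	if (err)
		return err;
	conf = mddev->private;
	if (!conf)
		err = -ENODEV;
	else
		conf->bypass_threshold = new;	/* placeholder update */
	mddev_unlock(mddev);

	return err ?: len;
}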
@@ -5364,29 +5429,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
5364static ssize_t 5429static ssize_t
5365raid5_show_preread_threshold(struct mddev *mddev, char *page) 5430raid5_show_preread_threshold(struct mddev *mddev, char *page)
5366{ 5431{
5367 struct r5conf *conf = mddev->private; 5432 struct r5conf *conf;
5433 int ret = 0;
5434 spin_lock(&mddev->lock);
5435 conf = mddev->private;
5368 if (conf) 5436 if (conf)
5369 return sprintf(page, "%d\n", conf->bypass_threshold); 5437 ret = sprintf(page, "%d\n", conf->bypass_threshold);
5370 else 5438 spin_unlock(&mddev->lock);
5371 return 0; 5439 return ret;
5372} 5440}
5373 5441
5374static ssize_t 5442static ssize_t
5375raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 5443raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
5376{ 5444{
5377 struct r5conf *conf = mddev->private; 5445 struct r5conf *conf;
5378 unsigned long new; 5446 unsigned long new;
5447 int err;
5448
5379 if (len >= PAGE_SIZE) 5449 if (len >= PAGE_SIZE)
5380 return -EINVAL; 5450 return -EINVAL;
5381 if (!conf)
5382 return -ENODEV;
5383
5384 if (kstrtoul(page, 10, &new)) 5451 if (kstrtoul(page, 10, &new))
5385 return -EINVAL; 5452 return -EINVAL;
5386 if (new > conf->max_nr_stripes) 5453
5387 return -EINVAL; 5454 err = mddev_lock(mddev);
5388 conf->bypass_threshold = new; 5455 if (err)
5389 return len; 5456 return err;
5457 conf = mddev->private;
5458 if (!conf)
5459 err = -ENODEV;
5460 else if (new > conf->max_nr_stripes)
5461 err = -EINVAL;
5462 else
5463 conf->bypass_threshold = new;
5464 mddev_unlock(mddev);
5465 return err ?: len;
5390} 5466}
5391 5467
5392static struct md_sysfs_entry 5468static struct md_sysfs_entry
@@ -5398,39 +5474,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
5398static ssize_t 5474static ssize_t
5399raid5_show_skip_copy(struct mddev *mddev, char *page) 5475raid5_show_skip_copy(struct mddev *mddev, char *page)
5400{ 5476{
5401 struct r5conf *conf = mddev->private; 5477 struct r5conf *conf;
5478 int ret = 0;
5479 spin_lock(&mddev->lock);
5480 conf = mddev->private;
5402 if (conf) 5481 if (conf)
5403 return sprintf(page, "%d\n", conf->skip_copy); 5482 ret = sprintf(page, "%d\n", conf->skip_copy);
5404 else 5483 spin_unlock(&mddev->lock);
5405 return 0; 5484 return ret;
5406} 5485}
5407 5486
5408static ssize_t 5487static ssize_t
5409raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 5488raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
5410{ 5489{
5411 struct r5conf *conf = mddev->private; 5490 struct r5conf *conf;
5412 unsigned long new; 5491 unsigned long new;
5492 int err;
5493
5413 if (len >= PAGE_SIZE) 5494 if (len >= PAGE_SIZE)
5414 return -EINVAL; 5495 return -EINVAL;
5415 if (!conf)
5416 return -ENODEV;
5417
5418 if (kstrtoul(page, 10, &new)) 5496 if (kstrtoul(page, 10, &new))
5419 return -EINVAL; 5497 return -EINVAL;
5420 new = !!new; 5498 new = !!new;
5421 if (new == conf->skip_copy)
5422 return len;
5423 5499
5424 mddev_suspend(mddev); 5500 err = mddev_lock(mddev);
5425 conf->skip_copy = new; 5501 if (err)
5426 if (new) 5502 return err;
5427 mddev->queue->backing_dev_info.capabilities |= 5503 conf = mddev->private;
5428 BDI_CAP_STABLE_WRITES; 5504 if (!conf)
5429 else 5505 err = -ENODEV;
5430 mddev->queue->backing_dev_info.capabilities &= 5506 else if (new != conf->skip_copy) {
5431 ~BDI_CAP_STABLE_WRITES; 5507 mddev_suspend(mddev);
5432 mddev_resume(mddev); 5508 conf->skip_copy = new;
5433 return len; 5509 if (new)
5510 mddev->queue->backing_dev_info.capabilities |=
5511 BDI_CAP_STABLE_WRITES;
5512 else
5513 mddev->queue->backing_dev_info.capabilities &=
5514 ~BDI_CAP_STABLE_WRITES;
5515 mddev_resume(mddev);
5516 }
5517 mddev_unlock(mddev);
5518 return err ?: len;
5434} 5519}
5435 5520
5436static struct md_sysfs_entry 5521static struct md_sysfs_entry
@@ -5454,11 +5539,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
5454static ssize_t 5539static ssize_t
5455raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 5540raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
5456{ 5541{
5457 struct r5conf *conf = mddev->private; 5542 struct r5conf *conf;
5543 int ret = 0;
5544 spin_lock(&mddev->lock);
5545 conf = mddev->private;
5458 if (conf) 5546 if (conf)
5459 return sprintf(page, "%d\n", conf->worker_cnt_per_group); 5547 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
5460 else 5548 spin_unlock(&mddev->lock);
5461 return 0; 5549 return ret;
5462} 5550}
5463 5551
5464static int alloc_thread_groups(struct r5conf *conf, int cnt, 5552static int alloc_thread_groups(struct r5conf *conf, int cnt,
@@ -5468,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
5468static ssize_t 5556static ssize_t
5469raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5557raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5470{ 5558{
5471 struct r5conf *conf = mddev->private; 5559 struct r5conf *conf;
5472 unsigned long new; 5560 unsigned long new;
5473 int err; 5561 int err;
5474 struct r5worker_group *new_groups, *old_groups; 5562 struct r5worker_group *new_groups, *old_groups;
@@ -5476,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5476 5564
5477 if (len >= PAGE_SIZE) 5565 if (len >= PAGE_SIZE)
5478 return -EINVAL; 5566 return -EINVAL;
5479 if (!conf)
5480 return -ENODEV;
5481
5482 if (kstrtoul(page, 10, &new)) 5567 if (kstrtoul(page, 10, &new))
5483 return -EINVAL; 5568 return -EINVAL;
5484 5569
5485 if (new == conf->worker_cnt_per_group) 5570 err = mddev_lock(mddev);
5486 return len; 5571 if (err)
5487 5572 return err;
5488 mddev_suspend(mddev); 5573 conf = mddev->private;
5574 if (!conf)
5575 err = -ENODEV;
5576 else if (new != conf->worker_cnt_per_group) {
5577 mddev_suspend(mddev);
5489 5578
5490 old_groups = conf->worker_groups; 5579 old_groups = conf->worker_groups;
5491 if (old_groups) 5580 if (old_groups)
5492 flush_workqueue(raid5_wq); 5581 flush_workqueue(raid5_wq);
5493 5582
5494 err = alloc_thread_groups(conf, new, 5583 err = alloc_thread_groups(conf, new,
5495 &group_cnt, &worker_cnt_per_group, 5584 &group_cnt, &worker_cnt_per_group,
5496 &new_groups); 5585 &new_groups);
5497 if (!err) { 5586 if (!err) {
5498 spin_lock_irq(&conf->device_lock); 5587 spin_lock_irq(&conf->device_lock);
5499 conf->group_cnt = group_cnt; 5588 conf->group_cnt = group_cnt;
5500 conf->worker_cnt_per_group = worker_cnt_per_group; 5589 conf->worker_cnt_per_group = worker_cnt_per_group;
5501 conf->worker_groups = new_groups; 5590 conf->worker_groups = new_groups;
5502 spin_unlock_irq(&conf->device_lock); 5591 spin_unlock_irq(&conf->device_lock);
5503 5592
5504 if (old_groups) 5593 if (old_groups)
5505 kfree(old_groups[0].workers); 5594 kfree(old_groups[0].workers);
5506 kfree(old_groups); 5595 kfree(old_groups);
5596 }
5597 mddev_resume(mddev);
5507 } 5598 }
5599 mddev_unlock(mddev);
5508 5600
5509 mddev_resume(mddev); 5601 return err ?: len;
5510
5511 if (err)
5512 return err;
5513 return len;
5514} 5602}
5515 5603
5516static struct md_sysfs_entry 5604static struct md_sysfs_entry
@@ -6178,11 +6266,6 @@ static int run(struct mddev *mddev)
6178 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6266 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
6179 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6267 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
6180 6268
6181 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
6182
6183 mddev->queue->backing_dev_info.congested_data = mddev;
6184 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
6185
6186 chunk_size = mddev->chunk_sectors << 9; 6269 chunk_size = mddev->chunk_sectors << 9;
6187 blk_queue_io_min(mddev->queue, chunk_size); 6270 blk_queue_io_min(mddev->queue, chunk_size);
6188 blk_queue_io_opt(mddev->queue, chunk_size * 6271 blk_queue_io_opt(mddev->queue, chunk_size *
@@ -6260,17 +6343,12 @@ abort:
6260 return -EIO; 6343 return -EIO;
6261} 6344}
6262 6345
6263static int stop(struct mddev *mddev) 6346static void raid5_free(struct mddev *mddev, void *priv)
6264{ 6347{
6265 struct r5conf *conf = mddev->private; 6348 struct r5conf *conf = priv;
6266 6349
6267 md_unregister_thread(&mddev->thread);
6268 if (mddev->queue)
6269 mddev->queue->backing_dev_info.congested_fn = NULL;
6270 free_conf(conf); 6350 free_conf(conf);
6271 mddev->private = NULL;
6272 mddev->to_remove = &raid5_attrs_group; 6351 mddev->to_remove = &raid5_attrs_group;
6273 return 0;
6274} 6352}
6275 6353
6276static void status(struct seq_file *seq, struct mddev *mddev) 6354static void status(struct seq_file *seq, struct mddev *mddev)
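The ->free method is called by the core only after the md thread and queue hooks have been torn down, which is why md_unregister_thread() and the congested_fn cleanup disappear from the raid5 side above. A hedged sketch of the expected caller in md.c; the names and exact ordering are assumptions based on the series description ("md: split detach operation out from ->stop.", "md: protect ->pers changes with mddev->lock"):

/* Sketch of the core-side teardown: detach is common code, and the
 * personality's ->free only releases its private configuration.
 */
static void __md_stop(struct mddev *mddev)
{
	struct md_personality *pers = mddev->pers;

	mddev_detach(mddev);		/* stop md thread, quiesce I/O */
	spin_lock(&mddev->lock);
	mddev->pers = NULL;
	spin_unlock(&mddev->lock);
	pers->free(mddev, mddev->private);
	mddev->private = NULL;
	module_put(pers->owner);
}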
@@ -7044,7 +7122,7 @@ static struct md_personality raid6_personality =
7044 .owner = THIS_MODULE, 7122 .owner = THIS_MODULE,
7045 .make_request = make_request, 7123 .make_request = make_request,
7046 .run = run, 7124 .run = run,
7047 .stop = stop, 7125 .free = raid5_free,
7048 .status = status, 7126 .status = status,
7049 .error_handler = error, 7127 .error_handler = error,
7050 .hot_add_disk = raid5_add_disk, 7128 .hot_add_disk = raid5_add_disk,
@@ -7058,6 +7136,8 @@ static struct md_personality raid6_personality =
7058 .finish_reshape = raid5_finish_reshape, 7136 .finish_reshape = raid5_finish_reshape,
7059 .quiesce = raid5_quiesce, 7137 .quiesce = raid5_quiesce,
7060 .takeover = raid6_takeover, 7138 .takeover = raid6_takeover,
7139 .congested = raid5_congested,
7140 .mergeable_bvec = raid5_mergeable_bvec,
7061}; 7141};
7062static struct md_personality raid5_personality = 7142static struct md_personality raid5_personality =
7063{ 7143{
@@ -7066,7 +7146,7 @@ static struct md_personality raid5_personality =
7066 .owner = THIS_MODULE, 7146 .owner = THIS_MODULE,
7067 .make_request = make_request, 7147 .make_request = make_request,
7068 .run = run, 7148 .run = run,
7069 .stop = stop, 7149 .free = raid5_free,
7070 .status = status, 7150 .status = status,
7071 .error_handler = error, 7151 .error_handler = error,
7072 .hot_add_disk = raid5_add_disk, 7152 .hot_add_disk = raid5_add_disk,
@@ -7080,6 +7160,8 @@ static struct md_personality raid5_personality =
7080 .finish_reshape = raid5_finish_reshape, 7160 .finish_reshape = raid5_finish_reshape,
7081 .quiesce = raid5_quiesce, 7161 .quiesce = raid5_quiesce,
7082 .takeover = raid5_takeover, 7162 .takeover = raid5_takeover,
7163 .congested = raid5_congested,
7164 .mergeable_bvec = raid5_mergeable_bvec,
7083}; 7165};
7084 7166
7085static struct md_personality raid4_personality = 7167static struct md_personality raid4_personality =
@@ -7089,7 +7171,7 @@ static struct md_personality raid4_personality =
7089 .owner = THIS_MODULE, 7171 .owner = THIS_MODULE,
7090 .make_request = make_request, 7172 .make_request = make_request,
7091 .run = run, 7173 .run = run,
7092 .stop = stop, 7174 .free = raid5_free,
7093 .status = status, 7175 .status = status,
7094 .error_handler = error, 7176 .error_handler = error,
7095 .hot_add_disk = raid5_add_disk, 7177 .hot_add_disk = raid5_add_disk,
@@ -7103,6 +7185,8 @@ static struct md_personality raid4_personality =
7103 .finish_reshape = raid5_finish_reshape, 7185 .finish_reshape = raid5_finish_reshape,
7104 .quiesce = raid5_quiesce, 7186 .quiesce = raid5_quiesce,
7105 .takeover = raid4_takeover, 7187 .takeover = raid4_takeover,
7188 .congested = raid5_congested,
7189 .mergeable_bvec = raid5_mergeable_bvec,
7106}; 7190};
7107 7191
7108static int __init raid5_init(void) 7192static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d59f5ca743cd..983e18a83db1 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout)
558 return layout >= 8 && layout <= 10; 558 return layout >= 8 && layout <= 10;
559} 559}
560 560
561extern int md_raid5_congested(struct mddev *mddev, int bits);
562extern void md_raid5_kick_device(struct r5conf *conf); 561extern void md_raid5_kick_device(struct r5conf *conf);
563extern int raid5_set_cache_size(struct mddev *mddev, int size); 562extern int raid5_set_cache_size(struct mddev *mddev, int size);
564#endif 563#endif