Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--  drivers/md/md.c  787
1 files changed, 461 insertions, 326 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f20d13e717d5..91e31e260b4a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,7 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
@@ -57,7 +57,6 @@
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
 
-
 #ifndef MODULE
 static void autostart_arrays(int part);
 #endif
@@ -68,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock);
 static void md_print_devices(void);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
 
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
@@ -148,6 +149,72 @@ static const struct block_device_operations md_fops;
 
 static int start_readonly;
 
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ */
+
+static void mddev_bio_destructor(struct bio *bio)
+{
+	mddev_t *mddev, **mddevp;
+
+	mddevp = (void*)bio;
+	mddev = mddevp[-1];
+
+	bio_free(bio, mddev->bio_set);
+}
+
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+			    mddev_t *mddev)
+{
+	struct bio *b;
+	mddev_t **mddevp;
+
+	if (!mddev || !mddev->bio_set)
+		return bio_alloc(gfp_mask, nr_iovecs);
+
+	b = bio_alloc_bioset(gfp_mask, nr_iovecs,
+			     mddev->bio_set);
+	if (!b)
+		return NULL;
+	mddevp = (void*)b;
+	mddevp[-1] = mddev;
+	b->bi_destructor = mddev_bio_destructor;
+	return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+			    mddev_t *mddev)
+{
+	struct bio *b;
+	mddev_t **mddevp;
+
+	if (!mddev || !mddev->bio_set)
+		return bio_clone(bio, gfp_mask);
+
+	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+			     mddev->bio_set);
+	if (!b)
+		return NULL;
+	mddevp = (void*)b;
+	mddevp[-1] = mddev;
+	b->bi_destructor = mddev_bio_destructor;
+	__bio_clone(b, bio);
+	if (bio_integrity(bio)) {
+		int ret;
+
+		ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
+
+		if (ret < 0) {
+			bio_put(b);
+			return NULL;
+		}
+	}
+
+	return b;
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
@@ -220,18 +287,21 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	mddev_t *mddev = q->queuedata;
 	int rv;
 	int cpu;
+	unsigned int sectors;
 
-	if (mddev == NULL || mddev->pers == NULL) {
+	if (mddev == NULL || mddev->pers == NULL
+	    || !mddev->ready) {
 		bio_io_error(bio);
 		return 0;
 	}
+	smp_rmb(); /* Ensure implications of 'active' are visible */
 	rcu_read_lock();
-	if (mddev->suspended || mddev->barrier) {
+	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended && !mddev->barrier)
+			if (!mddev->suspended)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -242,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 	atomic_inc(&mddev->active_io);
 	rcu_read_unlock();
 
+	/*
+	 * save the sectors now since our bio can
+	 * go away inside make_request
+	 */
+	sectors = bio_sectors(bio);
 	rv = mddev->pers->make_request(mddev, bio);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
 	part_stat_unlock();
 
 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
@@ -277,48 +351,45 @@ void mddev_resume(mddev_t *mddev)
 	mddev->suspended = 0;
 	wake_up(&mddev->sb_wait);
 	mddev->pers->quiesce(mddev, 0);
+
+	md_wakeup_thread(mddev->thread);
+	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 }
 EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-	if (mddev->barrier)
-		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
 	mddev_t *mddev = rdev->mddev;
-	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		if (mddev->barrier == POST_REQUEST_BARRIER) {
-			/* This was a post-request barrier */
-			mddev->barrier = NULL;
-			wake_up(&mddev->sb_wait);
-		} else
-			/* The pre-request barrier has finished */
-			schedule_work(&mddev->barrier_work);
+		/* The pre-request flush has finished */
+		queue_work(md_wq, &mddev->flush_work);
 	}
 	bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
 {
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
 	mdk_rdev_t *rdev;
 
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
@@ -331,106 +402,107 @@ static void submit_barriers(mddev_t *mddev)
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
-			bi = bio_alloc(GFP_KERNEL, 0);
-			bi->bi_end_io = md_end_barrier;
+			bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
+			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_BARRIER, bi);
+			submit_bio(WRITE_FLUSH, bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		queue_work(md_wq, &mddev->flush_work);
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-	struct bio *bio = mddev->barrier;
-
-	atomic_set(&mddev->flush_pending, 1);
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct bio *bio = mddev->flush_bio;
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		bio_endio(bio, -EOPNOTSUPP);
-	else if (bio->bi_size == 0)
+	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~REQ_HARDBARRIER;
+		bio->bi_rw &= ~REQ_FLUSH;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
-		mddev->barrier = POST_REQUEST_BARRIER;
-		submit_barriers(mddev);
-	}
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->barrier = NULL;
-		wake_up(&mddev->sb_wait);
 	}
+
+	mddev->flush_bio = NULL;
+	wake_up(&mddev->sb_wait);
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->barrier,
+			    !mddev->flush_bio,
 			    mddev->write_lock, /*nothing*/);
-	mddev->barrier = bio;
+	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
-	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
-
-	submit_barriers(mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		schedule_work(&mddev->barrier_work);
+	INIT_WORK(&mddev->flush_work, submit_flushes);
+	queue_work(md_wq, &mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
- * require having a whole queue
+ * require having a whole queue or request structures.
+ * We allocate an md_plug_cb for each md device and each thread it gets
+ * plugged on. This links tot the private plug_handle structure in the
+ * personality data where we keep a count of the number of outstanding
+ * plugs so other code can see if a plug is active.
  */
-static void plugger_work(struct work_struct *work)
-{
-	struct plug_handle *plug =
-		container_of(work, struct plug_handle, unplug_work);
-	plug->unplug_fn(plug);
-}
-static void plugger_timeout(unsigned long data)
-{
-	struct plug_handle *plug = (void *)data;
-	kblockd_schedule_work(NULL, &plug->unplug_work);
-}
-void plugger_init(struct plug_handle *plug,
-		  void (*unplug_fn)(struct plug_handle *))
-{
-	plug->unplug_flag = 0;
-	plug->unplug_fn = unplug_fn;
-	init_timer(&plug->unplug_timer);
-	plug->unplug_timer.function = plugger_timeout;
-	plug->unplug_timer.data = (unsigned long)plug;
-	INIT_WORK(&plug->unplug_work, plugger_work);
-}
-EXPORT_SYMBOL_GPL(plugger_init);
+struct md_plug_cb {
+	struct blk_plug_cb cb;
+	mddev_t *mddev;
+};
 
-void plugger_set_plug(struct plug_handle *plug)
+static void plugger_unplug(struct blk_plug_cb *cb)
 {
-	if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag))
-		mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1);
+	struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
+	if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
+		md_wakeup_thread(mdcb->mddev->thread);
+	kfree(mdcb);
 }
-EXPORT_SYMBOL_GPL(plugger_set_plug);
 
-int plugger_remove_plug(struct plug_handle *plug)
+/* Check that an unplug wakeup will come shortly.
+ * If not, wakeup the md thread immediately
+ */
+int mddev_check_plugged(mddev_t *mddev)
 {
-	if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) {
-		del_timer(&plug->unplug_timer);
-		return 1;
-	} else
+	struct blk_plug *plug = current->plug;
+	struct md_plug_cb *mdcb;
+
+	if (!plug)
 		return 0;
-}
-EXPORT_SYMBOL_GPL(plugger_remove_plug);
 
+	list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
+		if (mdcb->cb.callback == plugger_unplug &&
+		    mdcb->mddev == mddev) {
+			/* Already on the list, move to top */
+			if (mdcb != list_first_entry(&plug->cb_list,
+						    struct md_plug_cb,
+						    cb.list))
+				list_move(&mdcb->cb.list, &plug->cb_list);
+			return 1;
+		}
+	}
+	/* Not currently on the callback list */
+	mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
+	if (!mdcb)
+		return 0;
+
+	mdcb->mddev = mddev;
+	mdcb->cb.callback = plugger_unplug;
+	atomic_inc(&mddev->plug_cnt);
+	list_add(&mdcb->cb.list, &plug->cb_list);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(mddev_check_plugged);
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -442,6 +514,8 @@ static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(mddev_t *mddev)
 {
+	struct bio_set *bs = NULL;
+
 	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
 		return;
 	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -449,19 +523,22 @@ static void mddev_put(mddev_t *mddev)
 		/* Array is not configured at all, and not held active,
 		 * so destroy it */
 		list_del(&mddev->all_mddevs);
+		bs = mddev->bio_set;
+		mddev->bio_set = NULL;
 		if (mddev->gendisk) {
-			/* we did a probe so need to clean up.
-			 * Call schedule_work inside the spinlock
-			 * so that flush_scheduled_work() after
-			 * mddev_find will succeed in waiting for the
-			 * work to be done.
+			/* We did a probe so need to clean up. Call
+			 * queue_work inside the spinlock so that
+			 * flush_workqueue() after mddev_find will
+			 * succeed in waiting for the work to be done.
 			 */
 			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
-			schedule_work(&mddev->del_work);
+			queue_work(md_misc_wq, &mddev->del_work);
 		} else
 			kfree(mddev);
 	}
 	spin_unlock(&all_mddevs_lock);
+	if (bs)
+		bioset_free(bs);
 }
 
 void mddev_init(mddev_t *mddev)
@@ -475,6 +552,7 @@ void mddev_init(mddev_t *mddev)
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
+	atomic_set(&mddev->plug_cnt, 0);
 	spin_lock_init(&mddev->write_lock);
 	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
@@ -490,6 +568,9 @@ static mddev_t * mddev_find(dev_t unit)
 {
 	mddev_t *mddev, *new = NULL;
 
+	if (unit && MAJOR(unit) != MD_MAJOR)
+		unit &= ~((1<<MdpMinorShift)-1);
+
  retry:
 	spin_lock(&all_mddevs_lock);
 
@@ -647,9 +728,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
 }
 
 /* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
 {
-	sector_t num_sectors = bdev->bd_inode->i_size / 512;
+	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
 	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
@@ -696,31 +777,6 @@ static void super_written(struct bio *bio, int error)
 	bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-	struct bio *bio2 = bio->bi_private;
-	mdk_rdev_t *rdev = bio2->bi_private;
-	mddev_t *mddev = rdev->mddev;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-	    error == -EOPNOTSUPP) {
-		unsigned long flags;
-		/* barriers don't appear to be supported :-( */
-		set_bit(BarriersNotsupp, &rdev->flags);
-		mddev->barriers_work = 0;
-		spin_lock_irqsave(&mddev->write_lock, flags);
-		bio2->bi_next = mddev->biolist;
-		mddev->biolist = bio2;
-		spin_unlock_irqrestore(&mddev->write_lock, flags);
-		wake_up(&mddev->sb_wait);
-		bio_put(bio);
-	} else {
-		bio_put(bio2);
-		bio->bi_private = rdev;
-		super_written(bio, error);
-	}
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		   sector_t sector, int size, struct page *page)
 {
@@ -729,51 +785,27 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
-	 *
-	 * As we might need to resubmit the request if REQ_HARDBARRIER
-	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
-	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
+	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
-	bio->bi_bdev = rdev->bdev;
+	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_rw = rw;
 
 	atomic_inc(&mddev->pending_writes);
-	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-		struct bio *rbio;
-		rw |= REQ_HARDBARRIER;
-		rbio = bio_clone(bio, GFP_NOIO);
-		rbio->bi_private = bio;
-		rbio->bi_end_io = super_written_barrier;
-		submit_bio(rw, rbio);
-	} else
-		submit_bio(rw, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-	/* wait for all superblock writes that were scheduled to complete.
-	 * if any had to be retried (due to BARRIER problems), retry them
-	 */
+	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
 	for(;;) {
 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&mddev->pending_writes)==0)
 			break;
-		while (mddev->biolist) {
-			struct bio *bio;
-			spin_lock_irq(&mddev->write_lock);
-			bio = mddev->biolist;
-			mddev->biolist = bio->bi_next ;
-			bio->bi_next = NULL;
-			spin_unlock_irq(&mddev->write_lock);
-			submit_bio(bio->bi_rw, bio);
-		}
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
@@ -784,17 +816,21 @@ static void bi_complete(struct bio *bio, int error)
 	complete((struct completion*)bio->bi_private);
 }
 
-int sync_page_io(struct block_device *bdev, sector_t sector, int size,
-		 struct page *page, int rw)
+int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+		 struct page *page, int rw, bool metadata_op)
 {
-	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
 	struct completion event;
 	int ret;
 
-	rw |= REQ_SYNC | REQ_UNPLUG;
+	rw |= REQ_SYNC;
 
-	bio->bi_bdev = bdev;
-	bio->bi_sector = sector;
+	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+		rdev->meta_bdev : rdev->bdev;
+	if (metadata_op)
+		bio->bi_sector = sector + rdev->sb_start;
+	else
+		bio->bi_sector = sector + rdev->data_offset;
 	bio_add_page(bio, page, size, 0);
 	init_completion(&event);
 	bio->bi_private = &event;
@@ -819,7 +855,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
 		return 0;
 
 
-	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
+	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
 		goto fail;
 	rdev->sb_loaded = 1;
 	return 0;
@@ -981,7 +1017,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	 *
 	 * It also happens to be a multiple of 4Kb.
 	 */
-	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	rdev->sb_start = calc_dev_sboffset(rdev);
 
 	ret = read_disk_sb(rdev, MD_SB_BYTES);
 	if (ret) return ret;
@@ -1070,7 +1106,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
@@ -1323,13 +1358,13 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		return 0; /* component must fit device */
 	if (rdev->mddev->bitmap_info.offset)
 		return 0; /* can't move bitmap */
-	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	rdev->sb_start = calc_dev_sboffset(rdev);
 	if (!num_sectors || num_sectors > rdev->sb_start)
 		num_sectors = rdev->sb_start;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return num_sectors / 2; /* kB for sysfs */
+	return num_sectors;
 }
 
 
@@ -1378,7 +1413,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	 */
 	switch(minor_version) {
 	case 0:
-		sb_start = rdev->bdev->bd_inode->i_size >> 9;
+		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
 		sb_start -= 8*2;
 		sb_start &= ~(sector_t)(4*2-1);
 		break;
@@ -1464,7 +1499,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 			ret = 0;
 	}
 	if (minor_version)
-		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+		rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 			le64_to_cpu(sb->data_offset);
 	else
 		rdev->sectors = rdev->sb_start;
@@ -1485,7 +1520,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
@@ -1673,7 +1707,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
-		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
 		max_sectors -= rdev->data_offset;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
@@ -1683,7 +1717,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	} else {
 		/* minor version 0; superblock after data */
 		sector_t sb_start;
-		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
 		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
@@ -1697,7 +1731,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return num_sectors / 2; /* kB for sysfs */
+	return num_sectors;
 }
 
 static struct super_type super_types[] = {
@@ -1719,6 +1753,18 @@ static struct super_type super_types[] = {
 	},
 };
 
+static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	if (mddev->sync_super) {
+		mddev->sync_super(mddev, rdev);
+		return;
+	}
+
+	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
+
+	super_types[mddev->major_version].sync_super(mddev, rdev);
+}
+
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
 	mdk_rdev_t *rdev, *rdev2;
@@ -1750,20 +1796,14 @@ int md_integrity_register(mddev_t *mddev)
 
 	if (list_empty(&mddev->disks))
 		return 0; /* nothing to do */
-	if (blk_get_integrity(mddev->gendisk))
-		return 0; /* already registered */
+	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
+		return 0; /* shouldn't register, or already is */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		/* skip spares and non-functional disks */
 		if (test_bit(Faulty, &rdev->flags))
 			continue;
 		if (rdev->raid_disk < 0)
 			continue;
-		/*
-		 * If at least one rdev is not integrity capable, we can not
-		 * enable data integrity for the md device.
-		 */
-		if (!bdev_get_integrity(rdev->bdev))
-			return -EINVAL;
 		if (!reference) {
 			/* Use the first rdev as the reference */
 			reference = rdev;
@@ -1774,6 +1814,8 @@ int md_integrity_register(mddev_t *mddev)
 				rdev->bdev->bd_disk) < 0)
 			return -EINVAL;
 	}
+	if (!reference || !bdev_get_integrity(reference->bdev))
+		return 0;
 	/*
 	 * All component devices are integrity capable and have matching
 	 * profiles, register the common profile for the md device.
@@ -1784,8 +1826,12 @@ int md_integrity_register(mddev_t *mddev)
 			mdname(mddev));
 		return -EINVAL;
 	}
-	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
-	       mdname(mddev));
+	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(md_integrity_register);
@@ -1873,7 +1919,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
 
 	list_add_rcu(&rdev->same_set, &mddev->disks);
-	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
+	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
@@ -1900,7 +1946,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		MD_BUG();
 		return;
 	}
-	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
+	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
 	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
@@ -1914,7 +1960,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
-	schedule_work(&rdev->del_work);
+	queue_work(md_misc_wq, &rdev->del_work);
 }
 
 /*
@@ -1928,21 +1974,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				 shared ? (mdk_rdev_t *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
 			__bdevname(dev, b));
 		return PTR_ERR(bdev);
 	}
-	err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
-	if (err) {
-		printk(KERN_ERR "md: could not bd_claim %s.\n",
-			bdevname(bdev, b));
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return err;
-	}
-	if (!shared)
-		set_bit(AllReserved, &rdev->flags);
 	rdev->bdev = bdev;
 	return err;
 }
@@ -1953,8 +1991,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
 	rdev->bdev = NULL;
 	if (!bdev)
 		MD_BUG();
-	bd_release(bdev);
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 void md_autodetect_dev(dev_t dev);
@@ -2146,8 +2183,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 			/* Don't update this superblock */
 			rdev->sb_loaded = 2;
 		} else {
-			super_types[mddev->major_version].
-				sync_super(mddev, rdev);
+			sync_super(mddev, rdev);
 			rdev->sb_loaded = 1;
 		}
 	}
@@ -2172,6 +2208,8 @@ repeat:
 	if (!mddev->persistent) {
 		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
 		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+		if (!mddev->external)
+			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
 		wake_up(&mddev->sb_wait);
 		return;
 	}
@@ -2438,7 +2476,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (rdev->raid_disk == -1)
 			return -EEXIST;
 		/* personality does all needed checks */
-		if (rdev->mddev->pers->hot_add_disk == NULL)
+		if (rdev->mddev->pers->hot_remove_disk == NULL)
 			return -EINVAL;
 		err = rdev->mddev->pers->
 			hot_remove_disk(rdev->mddev, rdev->raid_disk);
@@ -2458,6 +2496,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (rdev->raid_disk != -1)
 			return -EBUSY;
 
+		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
+			return -EBUSY;
+
 		if (rdev->mddev->pers->hot_add_disk == NULL)
 			return -EINVAL;
 
@@ -2465,6 +2506,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (rdev2->raid_disk == slot)
 				return -EEXIST;
 
+		if (slot >= rdev->mddev->raid_disks &&
+		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+			return -ENOSPC;
+
 		rdev->raid_disk = slot;
 		if (test_bit(In_sync, &rdev->flags))
 			rdev->saved_raid_disk = slot;
@@ -2482,7 +2527,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			/* failure here is OK */;
 		/* don't wakeup anyone, leave that to userspace. */
 	} else {
-		if (slot >= rdev->mddev->raid_disks)
+		if (slot >= rdev->mddev->raid_disks &&
+		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
 			return -ENOSPC;
 		rdev->raid_disk = slot;
 		/* assume it is working */
@@ -2575,7 +2621,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (!sectors)
 				return -EBUSY;
 		} else if (!sectors)
-			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 				rdev->data_offset;
 	}
 	if (sectors < my_mddev->dev_sectors)
@@ -2598,12 +2644,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		mddev_lock(mddev);
 		list_for_each_entry(rdev2, &mddev->disks, same_set)
-			if (test_bit(AllReserved, &rdev2->flags) ||
-			    (rdev->bdev == rdev2->bdev &&
-			     rdev != rdev2 &&
-			     overlaps(rdev->data_offset, rdev->sectors,
-				      rdev2->data_offset,
-				      rdev2->sectors))) {
+			if (rdev->bdev == rdev2->bdev &&
+			    rdev != rdev2 &&
+			    overlaps(rdev->data_offset, rdev->sectors,
+				     rdev2->data_offset,
+				     rdev2->sectors)) {
 				overlap = 1;
 				break;
 			}
@@ -2788,7 +2833,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 
 	kobject_init(&rdev->kobj, &rdev_ktype);
 
-	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
 	if (!size) {
 		printk(KERN_WARNING
 			"md: %s has zero or unknown size, marking faulty!\n",
@@ -3107,7 +3152,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		char nm[20];
 		if (rdev->raid_disk < 0)
 			continue;
-		if (rdev->new_raid_disk > mddev->raid_disks)
+		if (rdev->new_raid_disk >= mddev->raid_disks)
 			rdev->new_raid_disk = -1;
 		if (rdev->new_raid_disk == rdev->raid_disk)
 			continue;
@@ -3139,6 +3184,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
+	mddev->degraded = 0;
 	if (mddev->pers->sync_request == NULL) {
 		/* this is now an array without redundancy, so
 		 * it must always be in_sync
@@ -3292,7 +3338,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 	char *e;
 	unsigned long long n = simple_strtoull(buf, &e, 10);
 
-	if (mddev->pers)
+	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
 		return -EBUSY;
 	if (cmd_match(buf, "none"))
 		n = MaxSector;
@@ -3736,6 +3782,8 @@ action_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%s\n", type);
 }
 
+static void reap_sync_thread(mddev_t *mddev);
+
 static ssize_t
 action_store(mddev_t *mddev, const char *page, size_t len)
 {
@@ -3750,9 +3798,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			md_unregister_thread(mddev->sync_thread);
-			mddev->sync_thread = NULL;
-			mddev->recovery = 0;
+			reap_sync_thread(mddev);
 		}
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3950,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-	unsigned long max_sectors, resync;
+	unsigned long long max_sectors, resync;
 
 	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return sprintf(page, "none\n");
@@ -3915,7 +3961,7 @@ sync_completed_show(mddev_t *mddev, char *page)
 		max_sectors = mddev->dev_sectors;
 
 	resync = mddev->curr_resync_completed;
-	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
 }
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4048,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
+	unsigned long long old = mddev->suspend_lo;
 
 	if (mddev->pers == NULL ||
 	    mddev->pers->quiesce == NULL)
 		return -EINVAL;
 	if (buf == e || (*e && *e != '\n'))
 		return -EINVAL;
-	if (new >= mddev->suspend_hi ||
-	    (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
-		mddev->suspend_lo = new;
+
+	mddev->suspend_lo = new;
+	if (new >= old)
+		/* Shrinking suspended region */
 		mddev->pers->quiesce(mddev, 2);
-		return len;
-	} else
-		return -EINVAL;
+	else {
+		/* Expanding suspended region - need to wait */
+		mddev->pers->quiesce(mddev, 1);
+		mddev->pers->quiesce(mddev, 0);
+	}
+	return len;
 }
 static struct md_sysfs_entry md_suspend_lo =
 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4082,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
+	unsigned long long old = mddev->suspend_hi;
 
 	if (mddev->pers == NULL ||
 	    mddev->pers->quiesce == NULL)
 		return -EINVAL;
 	if (buf == e || (*e && *e != '\n'))
 		return -EINVAL;
-	if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
-	    (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
-		mddev->suspend_hi = new;
+
+	mddev->suspend_hi = new;
+	if (new <= old)
+		/* Shrinking suspended region */
+		mddev->pers->quiesce(mddev, 2);
+	else {
+		/* Expanding suspended region - need to wait */
 		mddev->pers->quiesce(mddev, 1);
 		mddev->pers->quiesce(mddev, 0);
-		return len;
-	} else
-		return -EINVAL;
+	}
+	return len;
 }
 static struct md_sysfs_entry md_suspend_hi =
 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4112,10 +4167,10 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 
 	mddev->array_sectors = sectors;
-	set_capacity(mddev->gendisk, mddev->array_sectors);
-	if (mddev->pers)
+	if (mddev->pers) {
+		set_capacity(mddev->gendisk, mddev->array_sectors);
 		revalidate_disk(mddev->gendisk);
-
+	}
 	return len;
 }
 
@@ -4256,10 +4311,10 @@ static int md_alloc(dev_t dev, char *name)
 	shift = partitioned ? MdpMinorShift : 0;
 	unit = MINOR(mddev->unit) >> shift;
 
-	/* wait for any previous instance if this device
-	 * to be completed removed (mddev_delayed_delete).
+	/* wait for any previous instance of this device to be
+	 * completely removed (mddev_delayed_delete).
 	 */
-	flush_scheduled_work();
+	flush_workqueue(md_misc_wq);
 
 	mutex_lock(&disks_mutex);
 	error = -EEXIST;
@@ -4287,9 +4342,6 @@ static int md_alloc(dev_t dev, char *name)
 		goto abort;
 	mddev->queue->queuedata = mddev;
 
-	/* Can be unlocked because the queue is new: no concurrency */
-	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
-
 	blk_queue_make_request(mddev->queue, md_make_request);
 
 	disk = alloc_disk(1 << shift);
@@ -4309,13 +4361,19 @@ static int md_alloc(dev_t dev, char *name)
 	disk->fops = &md_fops;
 	disk->private_data = mddev;
 	disk->queue = mddev->queue;
+	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
 	/* Allow extended partitions. This makes the
 	 * 'mdp' device redundant, but we can't really
 	 * remove it now.
 	 */
 	disk->flags |= GENHD_FL_EXT_DEVT;
-	add_disk(disk);
 	mddev->gendisk = disk;
+	/* As soon as we call add_disk(), another thread could get
+	 * through to md_open, so make sure it doesn't get too far
+	 */
+	mutex_lock(&mddev->open_mutex);
+	add_disk(disk);
+
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
 	if (error) {
@@ -4329,6 +4387,7 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
+	mutex_unlock(&mddev->open_mutex);
  abort:
 	mutex_unlock(&disks_mutex);
 	if (!error && mddev->kobj.sd) {
@@ -4423,7 +4482,9 @@ int md_run(mddev_t *mddev)
 		 * We don't want the data to overlap the metadata,
 		 * Internal Bitmap issues have been handled elsewhere.
 		 */
-		if (rdev->data_offset < rdev->sb_start) {
+		if (rdev->meta_bdev) {
+			/* Nothing to check */;
+		} else if (rdev->data_offset < rdev->sb_start) {
 			if (mddev->dev_sectors &&
 			    rdev->data_offset + mddev->dev_sectors
 			    > rdev->sb_start) {
@@ -4442,6 +4503,9 @@ int md_run(mddev_t *mddev)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 	}
 
+	if (mddev->bio_set == NULL)
+		mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
@@ -4504,7 +4568,6 @@ int md_run(mddev_t *mddev)
 	/* may be over-ridden by personality */
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly && mddev->ro == 0)
@@ -4555,7 +4618,8 @@ int md_run(mddev_t *mddev)
 	mddev->safemode_timer.data = (unsigned long) mddev;
 	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
 	mddev->in_sync = 1;
-
+	smp_wmb();
+	mddev->ready = 1;
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0) {
 			char nm[20];
@@ -4569,9 +4633,6 @@ int md_run(mddev_t *mddev)
 	if (mddev->flags)
 		md_update_sb(mddev, 0);
 
-	md_wakeup_thread(mddev->thread);
-	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
-
 	md_new_event(mddev);
 	sysfs_notify_dirent_safe(mddev->sysfs_state);
 	sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -4592,8 +4653,13 @@ static int do_md_run(mddev_t *mddev)
 		bitmap_destroy(mddev);
 		goto out;
 	}
+
+	md_wakeup_thread(mddev->thread);
+	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
+	mddev->changed = 1;
 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
 out:
 	return err;
@@ -4682,24 +4748,22 @@ static void md_clean(mddev_t *mddev)
 	mddev->sync_speed_min = mddev->sync_speed_max = 0;
 	mddev->recovery = 0;
 	mddev->in_sync = 0;
+	mddev->changed = 0;
 	mddev->degraded = 0;
-	mddev->barriers_work = 0;
 	mddev->safemode = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
 	mddev->bitmap_info.chunksize = 0;
 	mddev->bitmap_info.daemon_sleep = 0;
 	mddev->bitmap_info.max_write_behind = 0;
-	mddev->plug = NULL;
 }
 
-void md_stop_writes(mddev_t *mddev)
+static void __md_stop_writes(mddev_t *mddev)
 {
 	if (mddev->sync_thread) {
 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-		md_unregister_thread(mddev->sync_thread);
-		mddev->sync_thread = NULL;
+		reap_sync_thread(mddev);
 	}
 
 	del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4777,18 @@ void md_stop_writes(mddev_t *mddev)
 		md_update_sb(mddev, 1);
 	}
 }
+
+void md_stop_writes(mddev_t *mddev)
+{
+	mddev_lock(mddev);
+	__md_stop_writes(mddev);
+	mddev_unlock(mddev);
+}
 EXPORT_SYMBOL_GPL(md_stop_writes);
 
 void md_stop(mddev_t *mddev)
 {
+	mddev->ready = 0;
 	mddev->pers->stop(mddev);
 	if (mddev->pers->sync_request && mddev->to_remove == NULL)
 		mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4808,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
 		goto out;
 	}
 	if (mddev->pers) {
-		md_stop_writes(mddev);
+		__md_stop_writes(mddev);
 
 		err = -ENXIO;
 		if (mddev->ro==1)
@@ -4773,10 +4845,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
 
-		md_stop_writes(mddev);
+		__md_stop_writes(mddev);
 		md_stop(mddev);
 		mddev->queue->merge_bvec_fn = NULL;
-		mddev->queue->unplug_fn = NULL;
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
 		/* tell userspace to handle 'inactive' */
@@ -4791,6 +4862,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4791 4862
4792 set_capacity(disk, 0); 4863 set_capacity(disk, 0);
4793 mutex_unlock(&mddev->open_mutex); 4864 mutex_unlock(&mddev->open_mutex);
4865 mddev->changed = 1;
4794 revalidate_disk(disk); 4866 revalidate_disk(disk);
4795 4867
4796 if (mddev->ro) 4868 if (mddev->ro)
@@ -5148,17 +5220,31 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5148 PTR_ERR(rdev)); 5220 PTR_ERR(rdev));
5149 return PTR_ERR(rdev); 5221 return PTR_ERR(rdev);
5150 } 5222 }
5151 /* set save_raid_disk if appropriate */ 5223 /* set saved_raid_disk if appropriate */
5152 if (!mddev->persistent) { 5224 if (!mddev->persistent) {
5153 if (info->state & (1<<MD_DISK_SYNC) && 5225 if (info->state & (1<<MD_DISK_SYNC) &&
5154 info->raid_disk < mddev->raid_disks) 5226 info->raid_disk < mddev->raid_disks) {
5155 rdev->raid_disk = info->raid_disk; 5227 rdev->raid_disk = info->raid_disk;
5156 else 5228 set_bit(In_sync, &rdev->flags);
5229 } else
5157 rdev->raid_disk = -1; 5230 rdev->raid_disk = -1;
5158 } else 5231 } else
5159 super_types[mddev->major_version]. 5232 super_types[mddev->major_version].
5160 validate_super(mddev, rdev); 5233 validate_super(mddev, rdev);
5161 rdev->saved_raid_disk = rdev->raid_disk; 5234 if ((info->state & (1<<MD_DISK_SYNC)) &&
5235 (!test_bit(In_sync, &rdev->flags) ||
5236 rdev->raid_disk != info->raid_disk)) {
 5237 /* This was a hot-add request, but events don't
 5238 * match, so reject it.
5239 */
5240 export_rdev(rdev);
5241 return -EINVAL;
5242 }
5243
5244 if (test_bit(In_sync, &rdev->flags))
5245 rdev->saved_raid_disk = rdev->raid_disk;
5246 else
5247 rdev->saved_raid_disk = -1;
5162 5248
5163 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5249 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5164 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5250 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
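[Editor's note] For persistent superblocks the hunk above tightens add_new_disk(): a request that claims MD_DISK_SYNC is rejected when validate_super() did not actually leave the device in-sync in the requested slot (its event count is stale), and saved_raid_disk is only remembered for genuinely in-sync members. A stand-alone model of that acceptance rule, with invented names; this is an illustration, not the kernel code:

#include <stdbool.h>
#include <stdio.h>

static int accept_hot_add(bool claims_sync, bool validated_in_sync,
			  int claimed_slot, int validated_slot,
			  int *saved_raid_disk)
{
	if (claims_sync &&
	    (!validated_in_sync || validated_slot != claimed_slot))
		return -1;		/* reject (the kernel returns -EINVAL) */

	/* only an in-sync member keeps its old slot for later recovery */
	*saved_raid_disk = validated_in_sync ? validated_slot : -1;
	return 0;
}

int main(void)
{
	int saved;
	printf("%d\n", accept_hot_add(true, false, 2, 2, &saved)); /* -1: rejected */
	printf("%d\n", accept_hot_add(true, true, 2, 2, &saved));  /*  0: accepted */
	return 0;
}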
@@ -5188,6 +5274,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5188 if (mddev->degraded) 5274 if (mddev->degraded)
5189 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5275 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5190 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5276 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5277 if (!err)
5278 md_new_event(mddev);
5191 md_wakeup_thread(mddev->thread); 5279 md_wakeup_thread(mddev->thread);
5192 return err; 5280 return err;
5193 } 5281 }
@@ -5225,9 +5313,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5225 5313
5226 if (!mddev->persistent) { 5314 if (!mddev->persistent) {
5227 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5315 printk(KERN_INFO "md: nonpersistent superblock ...\n");
5228 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5316 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5229 } else 5317 } else
5230 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5318 rdev->sb_start = calc_dev_sboffset(rdev);
5231 rdev->sectors = rdev->sb_start; 5319 rdev->sectors = rdev->sb_start;
5232 5320
5233 err = bind_rdev_to_array(rdev, mddev); 5321 err = bind_rdev_to_array(rdev, mddev);
@@ -5294,9 +5382,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
5294 } 5382 }
5295 5383
5296 if (mddev->persistent) 5384 if (mddev->persistent)
5297 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5385 rdev->sb_start = calc_dev_sboffset(rdev);
5298 else 5386 else
5299 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5387 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5300 5388
5301 rdev->sectors = rdev->sb_start; 5389 rdev->sectors = rdev->sb_start;
5302 5390
@@ -5507,7 +5595,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
5507 * sb_start or, if that is <data_offset, it must fit before the size 5595 * sb_start or, if that is <data_offset, it must fit before the size
5508 * of each device. If num_sectors is zero, we find the largest size 5596 * of each device. If num_sectors is zero, we find the largest size
5509 * that fits. 5597 * that fits.
5510
5511 */ 5598 */
5512 if (mddev->sync_thread) 5599 if (mddev->sync_thread)
5513 return -EBUSY; 5600 return -EBUSY;
@@ -5544,6 +5631,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
5544 mddev->delta_disks = raid_disks - mddev->raid_disks; 5631 mddev->delta_disks = raid_disks - mddev->raid_disks;
5545 5632
5546 rv = mddev->pers->check_reshape(mddev); 5633 rv = mddev->pers->check_reshape(mddev);
5634 if (rv < 0)
5635 mddev->delta_disks = 0;
5547 return rv; 5636 return rv;
5548} 5637}
5549 5638
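[Editor's note] The two added lines make update_raid_disks() undo its staged change when the personality rejects it: delta_disks is set speculatively, check_reshape() validates it, and on failure it is reset so no half-requested reshape lingers. A compilable toy model of that stage/validate/rollback flow; check_reshape here is an invented placeholder, not the md personality hook:

#include <stdio.h>

struct conf { int raid_disks, delta_disks; };

static int check_reshape(struct conf *c)
{
	/* placeholder rule: refuse to shrink below two devices */
	return c->raid_disks + c->delta_disks < 2 ? -22 /* -EINVAL */ : 0;
}

static int update_raid_disks(struct conf *c, int raid_disks)
{
	int rv;

	c->delta_disks = raid_disks - c->raid_disks;	/* stage the change */
	rv = check_reshape(c);
	if (rv < 0)
		c->delta_disks = 0;			/* roll it back */
	return rv;
}

int main(void)
{
	struct conf c = { .raid_disks = 4 };
	int rv = update_raid_disks(&c, 1);
	printf("%d %d\n", rv, c.delta_disks);	/* -22 0: nothing left staged */
	return 0;
}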
@@ -5951,16 +6040,14 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5951 mddev_t *mddev = mddev_find(bdev->bd_dev); 6040 mddev_t *mddev = mddev_find(bdev->bd_dev);
5952 int err; 6041 int err;
5953 6042
5954 lock_kernel();
5955 if (mddev->gendisk != bdev->bd_disk) { 6043 if (mddev->gendisk != bdev->bd_disk) {
5956 /* we are racing with mddev_put which is discarding this 6044 /* we are racing with mddev_put which is discarding this
5957 * bd_disk. 6045 * bd_disk.
5958 */ 6046 */
5959 mddev_put(mddev); 6047 mddev_put(mddev);
5960 /* Wait until bdev->bd_disk is definitely gone */ 6048 /* Wait until bdev->bd_disk is definitely gone */
5961 flush_scheduled_work(); 6049 flush_workqueue(md_misc_wq);
5962 /* Then retry the open from the top */ 6050 /* Then retry the open from the top */
5963 unlock_kernel();
5964 return -ERESTARTSYS; 6051 return -ERESTARTSYS;
5965 } 6052 }
5966 BUG_ON(mddev != bdev->bd_disk->private_data); 6053 BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -5972,9 +6059,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5972 atomic_inc(&mddev->openers); 6059 atomic_inc(&mddev->openers);
5973 mutex_unlock(&mddev->open_mutex); 6060 mutex_unlock(&mddev->open_mutex);
5974 6061
5975 check_disk_size_change(mddev->gendisk, bdev); 6062 check_disk_change(bdev);
5976 out: 6063 out:
5977 unlock_kernel();
5978 return err; 6064 return err;
5979} 6065}
5980 6066
@@ -5983,13 +6069,26 @@ static int md_release(struct gendisk *disk, fmode_t mode)
5983 mddev_t *mddev = disk->private_data; 6069 mddev_t *mddev = disk->private_data;
5984 6070
5985 BUG_ON(!mddev); 6071 BUG_ON(!mddev);
5986 lock_kernel();
5987 atomic_dec(&mddev->openers); 6072 atomic_dec(&mddev->openers);
5988 mddev_put(mddev); 6073 mddev_put(mddev);
5989 unlock_kernel();
5990 6074
5991 return 0; 6075 return 0;
5992} 6076}
6077
6078static int md_media_changed(struct gendisk *disk)
6079{
6080 mddev_t *mddev = disk->private_data;
6081
6082 return mddev->changed;
6083}
6084
6085static int md_revalidate(struct gendisk *disk)
6086{
6087 mddev_t *mddev = disk->private_data;
6088
6089 mddev->changed = 0;
6090 return 0;
6091}
5993static const struct block_device_operations md_fops = 6092static const struct block_device_operations md_fops =
5994{ 6093{
5995 .owner = THIS_MODULE, 6094 .owner = THIS_MODULE,
@@ -6000,6 +6099,8 @@ static const struct block_device_operations md_fops =
6000 .compat_ioctl = md_compat_ioctl, 6099 .compat_ioctl = md_compat_ioctl,
6001#endif 6100#endif
6002 .getgeo = md_getgeo, 6101 .getgeo = md_getgeo,
6102 .media_changed = md_media_changed,
6103 .revalidate_disk= md_revalidate,
6003}; 6104};
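[Editor's note] Taken together, the new pieces form the removable-media handshake the block layer of this era already understands: do_md_stop() sets mddev->changed, md_open() now calls check_disk_change(), which asks ->media_changed and, when that returns true, invokes ->revalidate_disk, letting md_revalidate() clear the flag so a later opener notices the size change. A user-space model of just that flag protocol; the function names mirror the ops but nothing here is kernel code:

#include <stdbool.h>
#include <stdio.h>

struct disk { bool changed; };

static bool media_changed(struct disk *d)  { return d->changed; }
static void revalidate(struct disk *d)     { d->changed = false; }

/* block-layer side of the handshake, reduced to its essence */
static void check_disk_change(struct disk *d)
{
	if (media_changed(d))
		revalidate(d);
}

int main(void)
{
	struct disk d = { .changed = true };	/* array was just stopped */
	check_disk_change(&d);			/* next open notices it */
	printf("changed=%d\n", d.changed);	/* 0: flag consumed */
	return 0;
}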
6004 6105
6005static int md_thread(void * arg) 6106static int md_thread(void * arg)
@@ -6036,8 +6137,8 @@ static int md_thread(void * arg)
6036 thread->timeout); 6137 thread->timeout);
6037 6138
6038 clear_bit(THREAD_WAKEUP, &thread->flags); 6139 clear_bit(THREAD_WAKEUP, &thread->flags);
6039 6140 if (!kthread_should_stop())
6040 thread->run(thread->mddev); 6141 thread->run(thread->mddev);
6041 } 6142 }
6042 6143
6043 return 0; 6144 return 0;
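[Editor's note] The guard added above makes md_thread() re-test kthread_should_stop() after each wakeup before calling thread->run(), because the wakeup that ended the wait may itself have been the stop request from md_unregister_thread(). The general "check again after waking" shape, reduced to plain C with invented names:

#include <stdbool.h>
#include <stdio.h>

static bool stop_requested;
static void do_work(void) { puts("working"); }

static void worker_loop(void)
{
	while (!stop_requested) {
		/* ... sleep until woken or a timeout expires ... */
		if (!stop_requested)	/* the wakeup may be the stop signal */
			do_work();
	}
}

int main(void)
{
	stop_requested = true;	/* pretend stop arrived before the first pass */
	worker_loop();		/* exits without doing any work */
	return 0;
}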
@@ -6118,7 +6219,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6118 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6219 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6119 md_wakeup_thread(mddev->thread); 6220 md_wakeup_thread(mddev->thread);
6120 if (mddev->event_work.func) 6221 if (mddev->event_work.func)
6121 schedule_work(&mddev->event_work); 6222 queue_work(md_misc_wq, &mddev->event_work);
6122 md_new_event_inintr(mddev); 6223 md_new_event_inintr(mddev);
6123} 6224}
6124 6225
@@ -6209,7 +6310,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
6209 * rt is a sector_t, so could be 32bit or 64bit. 6310 * rt is a sector_t, so could be 32bit or 64bit.
6210 * So we divide before multiply in case it is 32bit and close 6311 * So we divide before multiply in case it is 32bit and close
6211 * to the limit. 6312 * to the limit.
6212 * We scale the divisor (db) by 32 to avoid loosing precision 6313 * We scale the divisor (db) by 32 to avoid losing precision
6213 * near the end of resync when the number of remaining sectors 6314 * near the end of resync when the number of remaining sectors
6214 * is close to 'db'. 6315 * is close to 'db'.
6215 * We then divide rt by 32 after multiplying by db to compensate. 6316 * We then divide rt by 32 after multiplying by db to compensate.
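[Editor's note] The comment above (the hunk only corrects its spelling) describes the estimate computed just below it in status_resync(): with dt seconds elapsed, db sectors completed in that window and rt sectors remaining, the projected time left is roughly rt/db*dt. Dividing before multiplying keeps rt*dt within a possibly 32-bit sector_t, and scaling the divisor by 32 (then shifting the product back) preserves precision once rt shrinks toward db. A small worked version in plain user-space C; the numbers are made up:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t rt = 3ULL * 1000 * 1000 * 1000; /* sectors remaining (~1.4 TiB) */
	uint64_t db = 90000;                     /* sectors done this interval */
	uint64_t dt = 30;                        /* seconds in this interval */

	rt /= db / 32 + 1;	/* divide before multiply, with a scaled divisor */
	rt *= dt;
	rt >>= 5;		/* undo the x32 scaling of the divisor */

	printf("ETA ~ %" PRIu64 " min\n", rt / 60);	/* close to rt/db*dt */
	return 0;
}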
@@ -6631,14 +6732,6 @@ int md_allow_write(mddev_t *mddev)
6631} 6732}
6632EXPORT_SYMBOL_GPL(md_allow_write); 6733EXPORT_SYMBOL_GPL(md_allow_write);
6633 6734
6634void md_unplug(mddev_t *mddev)
6635{
6636 if (mddev->queue)
6637 blk_unplug(mddev->queue);
6638 if (mddev->plug)
6639 mddev->plug->unplug_fn(mddev->plug);
6640}
6641
6642#define SYNC_MARKS 10 6735#define SYNC_MARKS 10
6643#define SYNC_MARK_STEP (3*HZ) 6736#define SYNC_MARK_STEP (3*HZ)
6644void md_do_sync(mddev_t *mddev) 6737void md_do_sync(mddev_t *mddev)
@@ -6790,8 +6883,8 @@ void md_do_sync(mddev_t *mddev)
6790 * Tune reconstruction: 6883 * Tune reconstruction:
6791 */ 6884 */
6792 window = 32*(PAGE_SIZE/512); 6885 window = 32*(PAGE_SIZE/512);
6793 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6886 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
6794 window/2,(unsigned long long) max_sectors/2); 6887 window/2, (unsigned long long)max_sectors/2);
6795 6888
6796 atomic_set(&mddev->recovery_active, 0); 6889 atomic_set(&mddev->recovery_active, 0);
6797 last_check = 0; 6890 last_check = 0;
@@ -6802,7 +6895,7 @@ void md_do_sync(mddev_t *mddev)
6802 desc, mdname(mddev)); 6895 desc, mdname(mddev));
6803 mddev->curr_resync = j; 6896 mddev->curr_resync = j;
6804 } 6897 }
6805 mddev->curr_resync_completed = mddev->curr_resync; 6898 mddev->curr_resync_completed = j;
6806 6899
6807 while (j < max_sectors) { 6900 while (j < max_sectors) {
6808 sector_t sectors; 6901 sector_t sectors;
@@ -6817,11 +6910,9 @@ void md_do_sync(mddev_t *mddev)
6817 >= mddev->resync_max - mddev->curr_resync_completed 6910 >= mddev->resync_max - mddev->curr_resync_completed
6818 )) { 6911 )) {
6819 /* time to update curr_resync_completed */ 6912 /* time to update curr_resync_completed */
6820 md_unplug(mddev);
6821 wait_event(mddev->recovery_wait, 6913 wait_event(mddev->recovery_wait,
6822 atomic_read(&mddev->recovery_active) == 0); 6914 atomic_read(&mddev->recovery_active) == 0);
6823 mddev->curr_resync_completed = 6915 mddev->curr_resync_completed = j;
6824 mddev->curr_resync;
6825 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6916 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6826 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6917 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6827 } 6918 }
@@ -6894,7 +6985,6 @@ void md_do_sync(mddev_t *mddev)
6894 * about not overloading the IO subsystem. (things like an 6985 * about not overloading the IO subsystem. (things like an
6895 * e2fsck being done on the RAID array should execute fast) 6986 * e2fsck being done on the RAID array should execute fast)
6896 */ 6987 */
6897 md_unplug(mddev);
6898 cond_resched(); 6988 cond_resched();
6899 6989
6900 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6990 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6913,8 +7003,6 @@ void md_do_sync(mddev_t *mddev)
6913 * this also signals 'finished resyncing' to md_stop 7003 * this also signals 'finished resyncing' to md_stop
6914 */ 7004 */
6915 out: 7005 out:
6916 md_unplug(mddev);
6917
6918 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7006 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6919 7007
6920 /* tell personality that we are finished */ 7008 /* tell personality that we are finished */
@@ -6957,9 +7045,6 @@ void md_do_sync(mddev_t *mddev)
6957 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7045 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6958 mddev->resync_min = mddev->curr_resync_completed; 7046 mddev->resync_min = mddev->curr_resync_completed;
6959 mddev->curr_resync = 0; 7047 mddev->curr_resync = 0;
6960 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6961 mddev->curr_resync_completed = 0;
6962 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6963 wake_up(&resync_wait); 7048 wake_up(&resync_wait);
6964 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7049 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6965 md_wakeup_thread(mddev->thread); 7050 md_wakeup_thread(mddev->thread);
@@ -6977,7 +7062,6 @@ void md_do_sync(mddev_t *mddev)
6977} 7062}
6978EXPORT_SYMBOL_GPL(md_do_sync); 7063EXPORT_SYMBOL_GPL(md_do_sync);
6979 7064
6980
6981static int remove_and_add_spares(mddev_t *mddev) 7065static int remove_and_add_spares(mddev_t *mddev)
6982{ 7066{
6983 mdk_rdev_t *rdev; 7067 mdk_rdev_t *rdev;
@@ -7000,10 +7084,11 @@ static int remove_and_add_spares(mddev_t *mddev)
7000 } 7084 }
7001 } 7085 }
7002 7086
7003 if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { 7087 if (mddev->degraded && !mddev->recovery_disabled) {
7004 list_for_each_entry(rdev, &mddev->disks, same_set) { 7088 list_for_each_entry(rdev, &mddev->disks, same_set) {
7005 if (rdev->raid_disk >= 0 && 7089 if (rdev->raid_disk >= 0 &&
7006 !test_bit(In_sync, &rdev->flags) && 7090 !test_bit(In_sync, &rdev->flags) &&
7091 !test_bit(Faulty, &rdev->flags) &&
7007 !test_bit(Blocked, &rdev->flags)) 7092 !test_bit(Blocked, &rdev->flags))
7008 spares++; 7093 spares++;
7009 if (rdev->raid_disk < 0 7094 if (rdev->raid_disk < 0
@@ -7026,6 +7111,45 @@ static int remove_and_add_spares(mddev_t *mddev)
7026 } 7111 }
7027 return spares; 7112 return spares;
7028} 7113}
7114
7115static void reap_sync_thread(mddev_t *mddev)
7116{
7117 mdk_rdev_t *rdev;
7118
7119 /* resync has finished, collect result */
7120 md_unregister_thread(mddev->sync_thread);
7121 mddev->sync_thread = NULL;
7122 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7123 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7124 /* success...*/
7125 /* activate any spares */
7126 if (mddev->pers->spare_active(mddev))
7127 sysfs_notify(&mddev->kobj, NULL,
7128 "degraded");
7129 }
7130 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7131 mddev->pers->finish_reshape)
7132 mddev->pers->finish_reshape(mddev);
7133 md_update_sb(mddev, 1);
7134
7135 /* if array is no-longer degraded, then any saved_raid_disk
7136 * information must be scrapped
7137 */
7138 if (!mddev->degraded)
7139 list_for_each_entry(rdev, &mddev->disks, same_set)
7140 rdev->saved_raid_disk = -1;
7141
7142 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7143 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7144 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7145 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7146 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7147 /* flag recovery needed just to double check */
7148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7149 sysfs_notify_dirent_safe(mddev->sysfs_action);
7150 md_new_event(mddev);
7151}
7152
7029/* 7153/*
7030 * This routine is regularly called by all per-raid-array threads to 7154 * This routine is regularly called by all per-raid-array threads to
7031 * deal with generic issues like resync and super-block update. 7155 * deal with generic issues like resync and super-block update.
@@ -7050,8 +7174,8 @@ static int remove_and_add_spares(mddev_t *mddev)
7050 */ 7174 */
7051void md_check_recovery(mddev_t *mddev) 7175void md_check_recovery(mddev_t *mddev)
7052{ 7176{
7053 mdk_rdev_t *rdev; 7177 if (mddev->suspended)
7054 7178 return;
7055 7179
7056 if (mddev->bitmap) 7180 if (mddev->bitmap)
7057 bitmap_daemon_work(mddev); 7181 bitmap_daemon_work(mddev);
@@ -7087,7 +7211,20 @@ void md_check_recovery(mddev_t *mddev)
7087 /* Only thing we do on a ro array is remove 7211 /* Only thing we do on a ro array is remove
7088 * failed devices. 7212 * failed devices.
7089 */ 7213 */
7090 remove_and_add_spares(mddev); 7214 mdk_rdev_t *rdev;
7215 list_for_each_entry(rdev, &mddev->disks, same_set)
7216 if (rdev->raid_disk >= 0 &&
7217 !test_bit(Blocked, &rdev->flags) &&
7218 test_bit(Faulty, &rdev->flags) &&
7219 atomic_read(&rdev->nr_pending)==0) {
7220 if (mddev->pers->hot_remove_disk(
7221 mddev, rdev->raid_disk)==0) {
7222 char nm[20];
7223 sprintf(nm,"rd%d", rdev->raid_disk);
7224 sysfs_remove_link(&mddev->kobj, nm);
7225 rdev->raid_disk = -1;
7226 }
7227 }
7091 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7228 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7092 goto unlock; 7229 goto unlock;
7093 } 7230 }
@@ -7120,34 +7257,7 @@ void md_check_recovery(mddev_t *mddev)
7120 goto unlock; 7257 goto unlock;
7121 } 7258 }
7122 if (mddev->sync_thread) { 7259 if (mddev->sync_thread) {
7123 /* resync has finished, collect result */ 7260 reap_sync_thread(mddev);
7124 md_unregister_thread(mddev->sync_thread);
7125 mddev->sync_thread = NULL;
7126 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7127 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7128 /* success...*/
7129 /* activate any spares */
7130 if (mddev->pers->spare_active(mddev))
7131 sysfs_notify(&mddev->kobj, NULL,
7132 "degraded");
7133 }
7134 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7135 mddev->pers->finish_reshape)
7136 mddev->pers->finish_reshape(mddev);
7137 md_update_sb(mddev, 1);
7138
7139 /* if array is no-longer degraded, then any saved_raid_disk
7140 * information must be scrapped
7141 */
7142 if (!mddev->degraded)
7143 list_for_each_entry(rdev, &mddev->disks, same_set)
7144 rdev->saved_raid_disk = -1;
7145
7146 mddev->recovery = 0;
7147 /* flag recovery needed just to double check */
7148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7149 sysfs_notify_dirent_safe(mddev->sysfs_action);
7150 md_new_event(mddev);
7151 goto unlock; 7261 goto unlock;
7152 } 7262 }
7153 /* Set RUNNING before clearing NEEDED to avoid 7263 /* Set RUNNING before clearing NEEDED to avoid
@@ -7205,7 +7315,11 @@ void md_check_recovery(mddev_t *mddev)
7205 " thread...\n", 7315 " thread...\n",
7206 mdname(mddev)); 7316 mdname(mddev));
7207 /* leave the spares where they are, it shouldn't hurt */ 7317 /* leave the spares where they are, it shouldn't hurt */
7208 mddev->recovery = 0; 7318 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7319 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7320 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7321 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7322 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7209 } else 7323 } else
7210 md_wakeup_thread(mddev->sync_thread); 7324 md_wakeup_thread(mddev->sync_thread);
7211 sysfs_notify_dirent_safe(mddev->sysfs_action); 7325 sysfs_notify_dirent_safe(mddev->sysfs_action);
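[Editor's note] Where the old failure path wiped mddev->recovery wholesale, the replacement clears only the bits describing the sync attempt being abandoned, so unrelated state held in the same word (MD_RECOVERY_FROZEN, for instance) is preserved; the new reap_sync_thread() above makes the same change. A minimal illustration of the difference, using invented bit names:

#include <stdio.h>

#define REC_RUNNING (1u << 0)
#define REC_SYNC    (1u << 1)
#define REC_FROZEN  (1u << 2)	/* must survive an aborted sync */

int main(void)
{
	unsigned int recovery = REC_RUNNING | REC_SYNC | REC_FROZEN;

	/* old behaviour: recovery = 0;  which would also drop REC_FROZEN */
	recovery &= ~(REC_RUNNING | REC_SYNC);	/* new: clear selectively */

	printf("frozen still set: %d\n", !!(recovery & REC_FROZEN));	/* 1 */
	return 0;
}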
@@ -7278,12 +7392,23 @@ static void md_geninit(void)
7278 7392
7279static int __init md_init(void) 7393static int __init md_init(void)
7280{ 7394{
7281 if (register_blkdev(MD_MAJOR, "md")) 7395 int ret = -ENOMEM;
7282 return -1; 7396
7283 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 7397 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
7284 unregister_blkdev(MD_MAJOR, "md"); 7398 if (!md_wq)
7285 return -1; 7399 goto err_wq;
7286 } 7400
7401 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
7402 if (!md_misc_wq)
7403 goto err_misc_wq;
7404
7405 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
7406 goto err_md;
7407
7408 if ((ret = register_blkdev(0, "mdp")) < 0)
7409 goto err_mdp;
7410 mdp_major = ret;
7411
7287 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 7412 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
7288 md_probe, NULL, NULL); 7413 md_probe, NULL, NULL);
7289 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 7414 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
@@ -7294,8 +7419,16 @@ static int __init md_init(void)
7294 7419
7295 md_geninit(); 7420 md_geninit();
7296 return 0; 7421 return 0;
7297}
7298 7422
7423err_mdp:
7424 unregister_blkdev(MD_MAJOR, "md");
7425err_md:
7426 destroy_workqueue(md_misc_wq);
7427err_misc_wq:
7428 destroy_workqueue(md_wq);
7429err_wq:
7430 return ret;
7431}
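[Editor's note] The rewritten md_init() adopts the usual staged-initialisation pattern: acquire resources in order, jump to a label on the first failure, and let the labels below tear down only what was already acquired, in reverse order; md_exit() in the next hunk destroys the same workqueues on unload. md_wq is created with WQ_MEM_RECLAIM, presumably because its work items sit on the I/O path and must make progress under memory pressure. A compilable user-space skeleton of the unwind shape, with placeholder resources rather than kernel APIs:

#include <stdio.h>
#include <stdlib.h>

static void *acquire(const char *what)
{
	printf("acquire %s\n", what);
	return malloc(1);		/* stand-in for alloc_workqueue() etc. */
}

static void release(void *r, const char *what)
{
	printf("release %s\n", what);
	free(r);
}

static int subsys_init(void)
{
	void *wq, *misc_wq, *blkdev;

	wq = acquire("wq");
	if (!wq)
		goto err_wq;
	misc_wq = acquire("misc_wq");
	if (!misc_wq)
		goto err_misc_wq;
	blkdev = acquire("blkdev");
	if (!blkdev)
		goto err_blkdev;
	return 0;			/* success: keep everything acquired */

err_blkdev:
	release(misc_wq, "misc_wq");	/* unwind in reverse order */
err_misc_wq:
	release(wq, "wq");
err_wq:
	return -1;
}

int main(void) { return subsys_init(); }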
7299 7432
7300#ifndef MODULE 7433#ifndef MODULE
7301 7434
@@ -7382,6 +7515,8 @@ static __exit void md_exit(void)
7382 export_array(mddev); 7515 export_array(mddev);
7383 mddev->hold_active = 0; 7516 mddev->hold_active = 0;
7384 } 7517 }
7518 destroy_workqueue(md_misc_wq);
7519 destroy_workqueue(md_wq);
7385} 7520}
7386 7521
7387subsys_initcall(md_init); 7522subsys_initcall(md_init);