Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c     34
-rw-r--r--  drivers/md/md.c        636
-rw-r--r--  drivers/md/multipath.c  31
-rw-r--r--  drivers/md/raid1.c     216
-rw-r--r--  drivers/md/raid10.c     63
-rw-r--r--  drivers/md/raid5.c     261
-rw-r--r--  drivers/md/raid6main.c  40
7 files changed, 952 insertions(+), 329 deletions(-)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 01654fcabc52..51315302a85e 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -21,7 +21,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -272,7 +271,8 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 		return ERR_PTR(-ENOMEM);
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
-		if (! rdev->in_sync || rdev->faulty)
+		if (! test_bit(In_sync, &rdev->flags)
+		    || test_bit(Faulty, &rdev->flags))
 			continue;
 
 		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
@@ -292,7 +292,8 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
 	struct list_head *tmp;
 
 	ITERATE_RDEV(mddev, rdev, tmp)
-		if (rdev->in_sync && !rdev->faulty)
+		if (test_bit(In_sync, &rdev->flags)
+		    && !test_bit(Faulty, &rdev->flags))
 			md_super_write(mddev, rdev,
 				       (rdev->sb_offset<<1) + offset
 				       + page->index * (PAGE_SIZE/512),
@@ -300,7 +301,7 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
 				       page);
 
 	if (wait)
-		wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+		md_super_wait(mddev);
 	return 0;
 }
 
@@ -481,7 +482,8 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
 		reason = "bad magic";
-	else if (sb->version != cpu_to_le32(BITMAP_MAJOR))
+	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
+		 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
 		reason = "unrecognized superblock version";
 	else if (chunksize < 512 || chunksize > (1024 * 1024 * 4))
 		reason = "bitmap chunksize out of range (512B - 4MB)";
@@ -526,6 +528,8 @@ success:
 	bitmap->daemon_lastrun = jiffies;
 	bitmap->max_write_behind = write_behind;
 	bitmap->flags |= sb->state;
+	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
+		bitmap->flags |= BITMAP_HOSTENDIAN;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
 	if (sb->state & BITMAP_STALE)
 		bitmap->events_cleared = bitmap->mddev->events;
@@ -763,7 +767,10 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 
 	/* set the bit */
 	kaddr = kmap_atomic(page, KM_USER0);
-	set_bit(bit, kaddr);
+	if (bitmap->flags & BITMAP_HOSTENDIAN)
+		set_bit(bit, kaddr);
+	else
+		ext2_set_bit(bit, kaddr);
 	kunmap_atomic(kaddr, KM_USER0);
 	PRINTK("set file bit %lu page %lu\n", bit, page->index);
 
@@ -821,8 +828,7 @@ int bitmap_unplug(struct bitmap *bitmap)
 				wake_up_process(bitmap->writeback_daemon->tsk));
 			spin_unlock_irq(&bitmap->write_lock);
 		} else
-			wait_event(bitmap->mddev->sb_wait,
-				   atomic_read(&bitmap->mddev->pending_writes)==0);
+			md_super_wait(bitmap->mddev);
 	}
 	return 0;
 }
@@ -890,6 +896,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	oldindex = ~0L;
 
 	for (i = 0; i < chunks; i++) {
+		int b;
 		index = file_page_index(i);
 		bit = file_page_offset(i);
 		if (index != oldindex) { /* this is a new page, read it in */
@@ -938,7 +945,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 			bitmap->filemap[bitmap->file_pages++] = page;
 		}
-		if (test_bit(bit, page_address(page))) {
+		if (bitmap->flags & BITMAP_HOSTENDIAN)
+			b = test_bit(bit, page_address(page));
+		else
+			b = ext2_test_bit(bit, page_address(page));
+		if (b) {
 			/* if the disk bit is set, set the memory bit */
 			bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
 					       ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
@@ -1096,7 +1107,10 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 						  -1);
 
 				/* clear the bit */
-				clear_bit(file_page_offset(j), page_address(page));
+				if (bitmap->flags & BITMAP_HOSTENDIAN)
+					clear_bit(file_page_offset(j), page_address(page));
+				else
+					ext2_clear_bit(file_page_offset(j), page_address(page));
 			}
 		}
 		spin_unlock_irqrestore(&bitmap->lock, flags);
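
[Editor's note, not part of the patch] The bitmap.c hunks above all apply one rule: a bitmap superblock of version BITMAP_MAJOR_HOSTENDIAN keeps the on-disk bitmap in host byte order and uses the native set_bit/clear_bit/test_bit, while the default format stays little-endian on every architecture via the ext2_* bitops. A minimal sketch of that rule as a single helper (the helper name is ours; the patch open-codes the test at each call site):

	/* Hypothetical helper, for illustration only. */
	static inline void bitmap_file_bit_set(struct bitmap *bitmap,
					       unsigned long bit, void *kaddr)
	{
		if (bitmap->flags & BITMAP_HOSTENDIAN)
			set_bit(bit, kaddr);	  /* native CPU byte order */
		else
			ext2_set_bit(bit, kaddr); /* little-endian on disk */
	}
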
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9ecf51ee596f..adf960d8a7c9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -131,6 +131,8 @@ static ctl_table raid_root_table[] = {
 
 static struct block_device_operations md_fops;
 
+static int start_readonly;
+
 /*
  * Enables to iterate over all existing md arrays
  * all_mddevs_lock protects this list.
@@ -181,7 +183,7 @@ static void mddev_put(mddev_t *mddev)
 	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
 		list_del(&mddev->all_mddevs);
 		blk_put_queue(mddev->queue);
-		kfree(mddev);
+		kobject_unregister(&mddev->kobj);
 	}
 	spin_unlock(&all_mddevs_lock);
 }
@@ -330,18 +332,46 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 static int super_written(struct bio *bio, unsigned int bytes_done, int error)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
+	mddev_t *mddev = rdev->mddev;
 	if (bio->bi_size)
 		return 1;
 
 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
-		md_error(rdev->mddev, rdev);
+		md_error(mddev, rdev);
 
-	if (atomic_dec_and_test(&rdev->mddev->pending_writes))
-		wake_up(&rdev->mddev->sb_wait);
+	if (atomic_dec_and_test(&mddev->pending_writes))
+		wake_up(&mddev->sb_wait);
 	bio_put(bio);
 	return 0;
 }
 
+static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
+{
+	struct bio *bio2 = bio->bi_private;
+	mdk_rdev_t *rdev = bio2->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (bio->bi_size)
+		return 1;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+	    error == -EOPNOTSUPP) {
+		unsigned long flags;
+		/* barriers don't appear to be supported :-( */
+		set_bit(BarriersNotsupp, &rdev->flags);
+		mddev->barriers_work = 0;
+		spin_lock_irqsave(&mddev->write_lock, flags);
+		bio2->bi_next = mddev->biolist;
+		mddev->biolist = bio2;
+		spin_unlock_irqrestore(&mddev->write_lock, flags);
+		wake_up(&mddev->sb_wait);
+		bio_put(bio);
+		return 0;
+	}
+	bio_put(bio2);
+	bio->bi_private = rdev;
+	return super_written(bio, bytes_done, error);
+}
+
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		    sector_t sector, int size, struct page *page)
 {
@@ -350,16 +380,54 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
+	 *
+	 * As we might need to resubmit the request if BIO_RW_BARRIER
+	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
+	bio->bi_rw = rw;
+
 	atomic_inc(&mddev->pending_writes);
-	submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
+		struct bio *rbio;
+		rw |= (1<<BIO_RW_BARRIER);
+		rbio = bio_clone(bio, GFP_NOIO);
+		rbio->bi_private = bio;
+		rbio->bi_end_io = super_written_barrier;
+		submit_bio(rw, rbio);
+	} else
+		submit_bio(rw, bio);
+}
+
+void md_super_wait(mddev_t *mddev)
+{
+	/* wait for all superblock writes that were scheduled to complete.
+	 * if any had to be retried (due to BARRIER problems), retry them
+	 */
+	DEFINE_WAIT(wq);
+	for(;;) {
+		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&mddev->pending_writes)==0)
+			break;
+		while (mddev->biolist) {
+			struct bio *bio;
+			spin_lock_irq(&mddev->write_lock);
+			bio = mddev->biolist;
+			mddev->biolist = bio->bi_next ;
+			bio->bi_next = NULL;
+			spin_unlock_irq(&mddev->write_lock);
+			submit_bio(bio->bi_rw, bio);
+		}
+		schedule();
+	}
+	finish_wait(&mddev->sb_wait, &wq);
 }
 
 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
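
[Editor's note, not part of the patch] The two new functions above work as a pair: md_super_write() optimistically submits the superblock write as a barrier on a clone of the bio; if the device fails the clone with -EOPNOTSUPP, super_written_barrier() sets BarriersNotsupp and parks the original bio on mddev->biolist, and md_super_wait() resubmits it without the barrier bit (bio->bi_rw was assigned before BIO_RW_BARRIER was OR-ed in, so the parked bio carries the plain flags). Condensed to the three steps:

	/* 1. optimistic submission (md_super_write) */
	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
	if (!test_bit(BarriersNotsupp, &rdev->flags))
		submit_bio(rw | (1<<BIO_RW_BARRIER), rbio); /* rbio = clone */
	else
		submit_bio(rw, bio);

	/* 2. barrier rejected with -EOPNOTSUPP (super_written_barrier) */
	set_bit(BarriersNotsupp, &rdev->flags);	/* never retry barriers here */
	bio2->bi_next = mddev->biolist;		/* park original for retry */
	mddev->biolist = bio2;

	/* 3. retry without the barrier bit (md_super_wait) */
	submit_bio(bio->bi_rw, bio);
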
@@ -610,7 +678,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 
 	rdev->raid_disk = -1;
-	rdev->in_sync = 0;
+	rdev->flags = 0;
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
 		mddev->minor_version = sb->minor_version;
@@ -671,21 +739,19 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		return 0;
 
 	if (mddev->level != LEVEL_MULTIPATH) {
-		rdev->faulty = 0;
-		rdev->flags = 0;
 		desc = sb->disks + rdev->desc_nr;
 
 		if (desc->state & (1<<MD_DISK_FAULTY))
-			rdev->faulty = 1;
+			set_bit(Faulty, &rdev->flags);
 		else if (desc->state & (1<<MD_DISK_SYNC) &&
 			 desc->raid_disk < mddev->raid_disks) {
-			rdev->in_sync = 1;
+			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
 		}
 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
 	} else /* MULTIPATH are always insync */
-		rdev->in_sync = 1;
+		set_bit(In_sync, &rdev->flags);
 	return 0;
 }
 
@@ -699,6 +765,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	mdk_rdev_t *rdev2;
 	int next_spare = mddev->raid_disks;
 
+
 	/* make rdev->sb match mddev data..
 	 *
 	 * 1/ zero out disks
@@ -758,23 +825,27 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
 	ITERATE_RDEV(mddev,rdev2,tmp) {
 		mdp_disk_t *d;
-		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
-			rdev2->desc_nr = rdev2->raid_disk;
+		int desc_nr;
+		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+		    && !test_bit(Faulty, &rdev2->flags))
+			desc_nr = rdev2->raid_disk;
 		else
-			rdev2->desc_nr = next_spare++;
+			desc_nr = next_spare++;
+		rdev2->desc_nr = desc_nr;
 		d = &sb->disks[rdev2->desc_nr];
 		nr_disks++;
 		d->number = rdev2->desc_nr;
 		d->major = MAJOR(rdev2->bdev->bd_dev);
 		d->minor = MINOR(rdev2->bdev->bd_dev);
-		if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty)
+		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+		    && !test_bit(Faulty, &rdev2->flags))
 			d->raid_disk = rdev2->raid_disk;
 		else
 			d->raid_disk = rdev2->desc_nr; /* compatibility */
-		if (rdev2->faulty) {
+		if (test_bit(Faulty, &rdev2->flags)) {
 			d->state = (1<<MD_DISK_FAULTY);
 			failed++;
-		} else if (rdev2->in_sync) {
+		} else if (test_bit(In_sync, &rdev2->flags)) {
 			d->state = (1<<MD_DISK_ACTIVE);
 			d->state |= (1<<MD_DISK_SYNC);
 			active++;
@@ -787,7 +858,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		if (test_bit(WriteMostly, &rdev2->flags))
 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
 	}
-
 	/* now set the "removed" and "faulty" bits on any missing devices */
 	for (i=0 ; i < mddev->raid_disks ; i++) {
 		mdp_disk_t *d = &sb->disks[i];
@@ -944,7 +1014,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
 
 	rdev->raid_disk = -1;
-	rdev->in_sync = 0;
+	rdev->flags = 0;
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
 		mddev->patch_version = 0;
@@ -996,22 +1066,19 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
 		switch(role) {
 		case 0xffff: /* spare */
-			rdev->faulty = 0;
 			break;
 		case 0xfffe: /* faulty */
-			rdev->faulty = 1;
+			set_bit(Faulty, &rdev->flags);
 			break;
 		default:
-			rdev->in_sync = 1;
-			rdev->faulty = 0;
+			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = role;
 			break;
 		}
-		rdev->flags = 0;
 		if (sb->devflags & WriteMostly1)
 			set_bit(WriteMostly, &rdev->flags);
 	} else /* MULTIPATH are always insync */
-		rdev->in_sync = 1;
+		set_bit(In_sync, &rdev->flags);
 
 	return 0;
 }
@@ -1055,9 +1122,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	ITERATE_RDEV(mddev,rdev2,tmp) {
 		i = rdev2->desc_nr;
-		if (rdev2->faulty)
+		if (test_bit(Faulty, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
-		else if (rdev2->in_sync)
+		else if (test_bit(In_sync, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
 		else
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
@@ -1115,6 +1182,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	mdk_rdev_t *same_pdev;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+	struct kobject *ko;
 
 	if (rdev->mddev) {
 		MD_BUG();
@@ -1143,10 +1211,22 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		if (find_rdev_nr(mddev, rdev->desc_nr))
 			return -EBUSY;
 	}
+	bdevname(rdev->bdev,b);
+	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
+		return -ENOMEM;
 
 	list_add(&rdev->same_set, &mddev->disks);
 	rdev->mddev = mddev;
-	printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
+	printk(KERN_INFO "md: bind<%s>\n", b);
+
+	rdev->kobj.parent = &mddev->kobj;
+	kobject_add(&rdev->kobj);
+
+	if (rdev->bdev->bd_part)
+		ko = &rdev->bdev->bd_part->kobj;
+	else
+		ko = &rdev->bdev->bd_disk->kobj;
+	sysfs_create_link(&rdev->kobj, ko, "block");
 	return 0;
 }
 
@@ -1160,6 +1240,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 	list_del_init(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
+	sysfs_remove_link(&rdev->kobj, "block");
+	kobject_del(&rdev->kobj);
 }
 
 /*
@@ -1215,7 +1297,7 @@ static void export_rdev(mdk_rdev_t * rdev)
 	md_autodetect_dev(rdev->bdev->bd_dev);
 #endif
 	unlock_rdev(rdev);
-	kfree(rdev);
+	kobject_put(&rdev->kobj);
 }
 
 static void kick_rdev_from_array(mdk_rdev_t * rdev)
@@ -1287,7 +1369,8 @@ static void print_rdev(mdk_rdev_t *rdev)
 	char b[BDEVNAME_SIZE];
 	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
 		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
-		rdev->faulty, rdev->in_sync, rdev->desc_nr);
+		test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
+		rdev->desc_nr);
 	if (rdev->sb_loaded) {
 		printk(KERN_INFO "md: rdev superblock:\n");
 		print_sb((mdp_super_t*)page_address(rdev->sb_page));
@@ -1344,7 +1427,7 @@ static void md_update_sb(mddev_t * mddev)
 	int sync_req;
 
 repeat:
-	spin_lock(&mddev->write_lock);
+	spin_lock_irq(&mddev->write_lock);
 	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
 	mddev->events ++;
@@ -1367,11 +1450,11 @@ repeat:
 	 */
 	if (!mddev->persistent) {
 		mddev->sb_dirty = 0;
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 		wake_up(&mddev->sb_wait);
 		return;
 	}
-	spin_unlock(&mddev->write_lock);
+	spin_unlock_irq(&mddev->write_lock);
 
 	dprintk(KERN_INFO
 		"md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1381,11 +1464,11 @@ repeat:
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		char b[BDEVNAME_SIZE];
 		dprintk(KERN_INFO "md: ");
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			dprintk("(skipping faulty ");
 
 		dprintk("%s ", bdevname(rdev->bdev,b));
-		if (!rdev->faulty) {
+		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
 				       rdev->sb_offset<<1, rdev->sb_size,
 				       rdev->sb_page);
@@ -1399,21 +1482,106 @@ repeat:
 		/* only need to write one superblock... */
 		break;
 	}
-	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	md_super_wait(mddev);
 	/* if there was a failure, sb_dirty was set to 1, and we re-write super */
 
-	spin_lock(&mddev->write_lock);
+	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
 		/* have to write it out again */
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 		goto repeat;
 	}
 	mddev->sb_dirty = 0;
-	spin_unlock(&mddev->write_lock);
+	spin_unlock_irq(&mddev->write_lock);
 	wake_up(&mddev->sb_wait);
 
 }
 
+struct rdev_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(mdk_rdev_t *, char *);
+	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
+};
+
+static ssize_t
+state_show(mdk_rdev_t *rdev, char *page)
+{
+	char *sep = "";
+	int len=0;
+
+	if (test_bit(Faulty, &rdev->flags)) {
+		len+= sprintf(page+len, "%sfaulty",sep);
+		sep = ",";
+	}
+	if (test_bit(In_sync, &rdev->flags)) {
+		len += sprintf(page+len, "%sin_sync",sep);
+		sep = ",";
+	}
+	if (!test_bit(Faulty, &rdev->flags) &&
+	    !test_bit(In_sync, &rdev->flags)) {
+		len += sprintf(page+len, "%sspare", sep);
+		sep = ",";
+	}
+	return len+sprintf(page+len, "\n");
+}
+
+static struct rdev_sysfs_entry
+rdev_state = __ATTR_RO(state);
+
+static ssize_t
+super_show(mdk_rdev_t *rdev, char *page)
+{
+	if (rdev->sb_loaded && rdev->sb_size) {
+		memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
+		return rdev->sb_size;
+	} else
+		return 0;
+}
+static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
+
+static struct attribute *rdev_default_attrs[] = {
+	&rdev_state.attr,
+	&rdev_super.attr,
+	NULL,
+};
+static ssize_t
+rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+	if (!entry->show)
+		return -EIO;
+	return entry->show(rdev, page);
+}
+
+static ssize_t
+rdev_attr_store(struct kobject *kobj, struct attribute *attr,
+		const char *page, size_t length)
+{
+	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+	if (!entry->store)
+		return -EIO;
+	return entry->store(rdev, page, length);
+}
+
+static void rdev_free(struct kobject *ko)
+{
+	mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
+	kfree(rdev);
+}
+static struct sysfs_ops rdev_sysfs_ops = {
+	.show = rdev_attr_show,
+	.store = rdev_attr_store,
+};
+static struct kobj_type rdev_ktype = {
+	.release = rdev_free,
+	.sysfs_ops = &rdev_sysfs_ops,
+	.default_attrs = rdev_default_attrs,
+};
+
 /*
  * Import a device. If 'super_format' >= 0, then sanity check the superblock
  *
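
[Editor's note, not part of the patch] Together with the kobject_add() in bind_rdev_to_array() above and the "md" kobject registered in md_probe() further down, each member device becomes visible in sysfs as dev-<bdevname> with read-only "state" and "super" attributes. An illustrative userspace read (the path assumes an array md0 with member sda1; adjust for real device names):

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		FILE *f = fopen("/sys/block/md0/md/dev-sda1/state", "r");

		if (f && fgets(buf, sizeof(buf), f))
			printf("%s", buf);	/* "in_sync", "faulty" or "spare" */
		if (f)
			fclose(f);
		return 0;
	}
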
@@ -1445,11 +1613,15 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	if (err)
 		goto abort_free;
 
+	rdev->kobj.parent = NULL;
+	rdev->kobj.ktype = &rdev_ktype;
+	kobject_init(&rdev->kobj);
+
 	rdev->desc_nr = -1;
-	rdev->faulty = 0;
-	rdev->in_sync = 0;
+	rdev->flags = 0;
 	rdev->data_offset = 0;
 	atomic_set(&rdev->nr_pending, 0);
+	atomic_set(&rdev->read_errors, 0);
 
 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 	if (!size) {
@@ -1537,7 +1709,7 @@ static void analyze_sbs(mddev_t * mddev)
 		if (mddev->level == LEVEL_MULTIPATH) {
 			rdev->desc_nr = i++;
 			rdev->raid_disk = rdev->desc_nr;
-			rdev->in_sync = 1;
+			set_bit(In_sync, &rdev->flags);
 		}
 	}
 
@@ -1551,6 +1723,162 @@ static void analyze_sbs(mddev_t * mddev)
 
 }
 
+static ssize_t
+level_show(mddev_t *mddev, char *page)
+{
+	mdk_personality_t *p = mddev->pers;
+	if (p == NULL && mddev->raid_disks == 0)
+		return 0;
+	if (mddev->level >= 0)
+		return sprintf(page, "RAID-%d\n", mddev->level);
+	else
+		return sprintf(page, "%s\n", p->name);
+}
+
+static struct md_sysfs_entry md_level = __ATTR_RO(level);
+
+static ssize_t
+raid_disks_show(mddev_t *mddev, char *page)
+{
+	if (mddev->raid_disks == 0)
+		return 0;
+	return sprintf(page, "%d\n", mddev->raid_disks);
+}
+
+static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);
+
+static ssize_t
+action_show(mddev_t *mddev, char *page)
+{
+	char *type = "idle";
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+				type = "resync";
+			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+				type = "check";
+			else
+				type = "repair";
+		} else
+			type = "recover";
+	}
+	return sprintf(page, "%s\n", type);
+}
+
+static ssize_t
+action_store(mddev_t *mddev, const char *page, size_t len)
+{
+	if (!mddev->pers || !mddev->pers->sync_request)
+		return -EINVAL;
+
+	if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) {
+		if (mddev->sync_thread) {
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+			mddev->recovery = 0;
+		}
+		return len;
+	}
+
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+		return -EBUSY;
+	if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 ||
+	    strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0)
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else {
+		if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0)
+			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0)
+			return -EINVAL;
+		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	}
+	md_wakeup_thread(mddev->thread);
+	return len;
+}
+
+static ssize_t
+mismatch_cnt_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long) mddev->resync_mismatches);
+}
+
+static struct md_sysfs_entry
+md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+
+static struct md_sysfs_entry
+md_mismatches = __ATTR_RO(mismatch_cnt);
+
+static struct attribute *md_default_attrs[] = {
+	&md_level.attr,
+	&md_raid_disks.attr,
+	NULL,
+};
+
+static struct attribute *md_redundancy_attrs[] = {
+	&md_scan_mode.attr,
+	&md_mismatches.attr,
+	NULL,
+};
+static struct attribute_group md_redundancy_group = {
+	.name = NULL,
+	.attrs = md_redundancy_attrs,
+};
+
+
+static ssize_t
+md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	ssize_t rv;
+
+	if (!entry->show)
+		return -EIO;
+	mddev_lock(mddev);
+	rv = entry->show(mddev, page);
+	mddev_unlock(mddev);
+	return rv;
+}
+
+static ssize_t
+md_attr_store(struct kobject *kobj, struct attribute *attr,
+	      const char *page, size_t length)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	ssize_t rv;
+
+	if (!entry->store)
+		return -EIO;
+	mddev_lock(mddev);
+	rv = entry->store(mddev, page, length);
+	mddev_unlock(mddev);
+	return rv;
+}
+
+static void md_free(struct kobject *ko)
+{
+	mddev_t *mddev = container_of(ko, mddev_t, kobj);
+	kfree(mddev);
+}
+
+static struct sysfs_ops md_sysfs_ops = {
+	.show = md_attr_show,
+	.store = md_attr_store,
+};
+static struct kobj_type md_ktype = {
+	.release = md_free,
+	.sysfs_ops = &md_sysfs_ops,
+	.default_attrs = md_default_attrs,
+};
+
 int mdp_major = 0;
 
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
@@ -1592,6 +1920,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	add_disk(disk);
 	mddev->gendisk = disk;
 	up(&disks_sem);
+	mddev->kobj.parent = &disk->kobj;
+	mddev->kobj.k_name = NULL;
+	snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
+	mddev->kobj.ktype = &md_ktype;
+	kobject_register(&mddev->kobj);
 	return NULL;
 }
 
@@ -1663,7 +1996,7 @@ static int do_md_run(mddev_t * mddev)
 
 	/* devices must have minimum size of one chunk */
 	ITERATE_RDEV(mddev,rdev,tmp) {
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			continue;
 		if (rdev->size < chunk_size / 1024) {
 			printk(KERN_WARNING
@@ -1691,7 +2024,7 @@ static int do_md_run(mddev_t * mddev)
 	 * Also find largest hardsector size
 	 */
 	ITERATE_RDEV(mddev,rdev,tmp) {
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			continue;
 		sync_blockdev(rdev->bdev);
 		invalidate_bdev(rdev->bdev, 0);
@@ -1715,6 +2048,10 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+	mddev->barriers_work = 1;
+
+	if (start_readonly)
+		mddev->ro = 2; /* read-only, but switch on first write */
 
 	/* before we start the array running, initialise the bitmap */
 	err = bitmap_create(mddev);
@@ -1730,12 +2067,24 @@ static int do_md_run(mddev_t * mddev)
 		bitmap_destroy(mddev);
 		return err;
 	}
+	if (mddev->pers->sync_request)
+		sysfs_create_group(&mddev->kobj, &md_redundancy_group);
+	else if (mddev->ro == 2) /* auto-readonly not meaningful */
+		mddev->ro = 0;
+
 	atomic_set(&mddev->writes_pending,0);
 	mddev->safemode = 0;
 	mddev->safemode_timer.function = md_safemode_timeout;
 	mddev->safemode_timer.data = (unsigned long) mddev;
 	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
 	mddev->in_sync = 1;
+
+	ITERATE_RDEV(mddev,rdev,tmp)
+		if (rdev->raid_disk >= 0) {
+			char nm[20];
+			sprintf(nm, "rd%d", rdev->raid_disk);
+			sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+		}
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -1821,16 +2170,19 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
 	if (ro) {
 		err = -ENXIO;
-		if (mddev->ro)
+		if (mddev->ro==1)
 			goto out;
 		mddev->ro = 1;
 	} else {
 		bitmap_flush(mddev);
-		wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+		md_super_wait(mddev);
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
 		blk_queue_make_request(mddev->queue, md_fail_request);
 		mddev->pers->stop(mddev);
+		if (mddev->pers->sync_request)
+			sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+
 		module_put(mddev->pers->owner);
 		mddev->pers = NULL;
 		if (mddev->ro)
@@ -1857,9 +2209,18 @@ static int do_md_stop(mddev_t * mddev, int ro)
 	 * Free resources if final stop
 	 */
 	if (!ro) {
+		mdk_rdev_t *rdev;
+		struct list_head *tmp;
 		struct gendisk *disk;
 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
 
+		ITERATE_RDEV(mddev,rdev,tmp)
+			if (rdev->raid_disk >= 0) {
+				char nm[20];
+				sprintf(nm, "rd%d", rdev->raid_disk);
+				sysfs_remove_link(&mddev->kobj, nm);
+			}
+
 		export_array(mddev);
 
 		mddev->array_size = 0;
@@ -2012,7 +2373,7 @@ static int autostart_array(dev_t startdev)
 		return err;
 	}
 
-	if (start_rdev->faulty) {
+	if (test_bit(Faulty, &start_rdev->flags)) {
 		printk(KERN_WARNING
 			"md: can not autostart based on faulty %s!\n",
 			bdevname(start_rdev->bdev,b));
@@ -2071,11 +2432,11 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	nr=working=active=failed=spare=0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		nr++;
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			failed++;
 		else {
 			working++;
-			if (rdev->in_sync)
+			if (test_bit(In_sync, &rdev->flags))
 				active++;
 			else
 				spare++;
@@ -2166,9 +2527,9 @@ static int get_disk_info(mddev_t * mddev, void __user * arg)
 		info.minor = MINOR(rdev->bdev->bd_dev);
 		info.raid_disk = rdev->raid_disk;
 		info.state = 0;
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			info.state |= (1<<MD_DISK_FAULTY);
-		else if (rdev->in_sync) {
+		else if (test_bit(In_sync, &rdev->flags)) {
 			info.state |= (1<<MD_DISK_ACTIVE);
 			info.state |= (1<<MD_DISK_SYNC);
 		}
@@ -2261,7 +2622,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		validate_super(mddev, rdev);
 		rdev->saved_raid_disk = rdev->raid_disk;
 
-		rdev->in_sync = 0; /* just to be sure */
+		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
 
@@ -2299,11 +2660,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		else
 			rdev->raid_disk = -1;
 
-		rdev->faulty = 0;
+		rdev->flags = 0;
+
 		if (rdev->raid_disk < mddev->raid_disks)
-			rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
-		else
-			rdev->in_sync = 0;
+			if (info->state & (1<<MD_DISK_SYNC))
+				set_bit(In_sync, &rdev->flags);
 
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
@@ -2402,14 +2763,14 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 		goto abort_export;
 	}
 
-	if (rdev->faulty) {
+	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING
 			"md: can not hot-add faulty %s disk to %s!\n",
 			bdevname(rdev->bdev,b), mdname(mddev));
 		err = -EINVAL;
 		goto abort_export;
 	}
-	rdev->in_sync = 0;
+	clear_bit(In_sync, &rdev->flags);
 	rdev->desc_nr = -1;
 	bind_rdev_to_array(rdev, mddev);
 
@@ -2929,12 +3290,22 @@ static int md_ioctl(struct inode *inode, struct file *file,
 
 	/*
 	 * The remaining ioctls are changing the state of the
-	 * superblock, so we do not allow read-only arrays
-	 * here:
+	 * superblock, so we do not allow them on read-only arrays.
+	 * However non-MD ioctls (e.g. get-size) will still come through
+	 * here and hit the 'default' below, so only disallow
+	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
 	 */
-	if (mddev->ro) {
-		err = -EROFS;
-		goto abort_unlock;
+	if (_IOC_TYPE(cmd) == MD_MAJOR &&
+	    mddev->ro && mddev->pers) {
+		if (mddev->ro == 2) {
+			mddev->ro = 0;
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+
+		} else {
+			err = -EROFS;
+			goto abort_unlock;
+		}
 	}
 
 	switch (cmd)
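
[Editor's note, not part of the patch] The ioctl hunk above completes the auto-read-only state machine: mddev->ro is now a tri-state, and an array started under the new start_readonly behaviour stays in state 2 until the first write (md_write_start, later in this diff) or a state-changing md ioctl promotes it back to 0 and kicks recovery. Summarised with our own names (md.c uses the bare values):

	/* Illustrative only; the code uses literal 0/1/2. */
	enum {
		MD_RDWR    = 0,	/* normal read-write operation */
		MD_RDONLY  = 1,	/* explicitly read-only: md ioctls get -EROFS */
		MD_AUTO_RO = 2,	/* no superblock/resync writes yet; the first
				 * write flips it back to 0 */
	};
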
@@ -3064,21 +3435,17 @@ static int md_thread(void * arg)
 	 */
 
 	allow_signal(SIGKILL);
-	complete(thread->event);
 	while (!kthread_should_stop()) {
-		void (*run)(mddev_t *);
 
-		wait_event_interruptible_timeout(thread->wqueue,
-						 test_bit(THREAD_WAKEUP, &thread->flags)
-						 || kthread_should_stop(),
-						 thread->timeout);
+		wait_event_timeout(thread->wqueue,
+				   test_bit(THREAD_WAKEUP, &thread->flags)
+				   || kthread_should_stop(),
+				   thread->timeout);
 		try_to_freeze();
 
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
-		run = thread->run;
-		if (run)
-			run(thread->mddev);
+		thread->run(thread->mddev);
 	}
 
 	return 0;
@@ -3097,7 +3464,6 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 			   const char *name)
 {
 	mdk_thread_t *thread;
-	struct completion event;
 
 	thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
 	if (!thread)
@@ -3106,18 +3472,14 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 	memset(thread, 0, sizeof(mdk_thread_t));
 	init_waitqueue_head(&thread->wqueue);
 
-	init_completion(&event);
-	thread->event = &event;
 	thread->run = run;
 	thread->mddev = mddev;
-	thread->name = name;
 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
 	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
 	if (IS_ERR(thread->tsk)) {
 		kfree(thread);
 		return NULL;
 	}
-	wait_for_completion(&event);
 	return thread;
 }
 
@@ -3136,7 +3498,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 		return;
 	}
 
-	if (!rdev || rdev->faulty)
+	if (!rdev || test_bit(Faulty, &rdev->flags))
 		return;
 /*
 	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
@@ -3322,8 +3684,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "%s : %sactive", mdname(mddev),
 		   mddev->pers ? "" : "in");
 	if (mddev->pers) {
-		if (mddev->ro)
+		if (mddev->ro==1)
 			seq_printf(seq, " (read-only)");
+		if (mddev->ro==2)
+			seq_printf(seq, "(auto-read-only)");
 		seq_printf(seq, " %s", mddev->pers->name);
 	}
 
@@ -3334,7 +3698,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			bdevname(rdev->bdev,b), rdev->desc_nr);
 		if (test_bit(WriteMostly, &rdev->flags))
 			seq_printf(seq, "(W)");
-		if (rdev->faulty) {
+		if (test_bit(Faulty, &rdev->flags)) {
 			seq_printf(seq, "(F)");
 			continue;
 		} else if (rdev->raid_disk < 0)
@@ -3363,11 +3727,15 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		if (mddev->pers) {
 			mddev->pers->status (seq, mddev);
 			seq_printf(seq, "\n ");
-			if (mddev->curr_resync > 2) {
-				status_resync (seq, mddev);
-				seq_printf(seq, "\n ");
-			} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
-				seq_printf(seq, " resync=DELAYED\n ");
+			if (mddev->pers->sync_request) {
+				if (mddev->curr_resync > 2) {
+					status_resync (seq, mddev);
+					seq_printf(seq, "\n ");
+				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+					seq_printf(seq, "\tresync=DELAYED\n ");
+				else if (mddev->recovery_cp < MaxSector)
+					seq_printf(seq, "\tresync=PENDING\n ");
+			}
 		} else
 			seq_printf(seq, "\n ");
 
@@ -3504,15 +3872,22 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 	if (bio_data_dir(bi) != WRITE)
 		return;
 
+	BUG_ON(mddev->ro == 1);
+	if (mddev->ro == 2) {
+		/* need to switch to read/write */
+		mddev->ro = 0;
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
 	atomic_inc(&mddev->writes_pending);
 	if (mddev->in_sync) {
-		spin_lock(&mddev->write_lock);
+		spin_lock_irq(&mddev->write_lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
 			mddev->sb_dirty = 1;
 			md_wakeup_thread(mddev->thread);
 		}
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 	}
 	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
 }
@@ -3568,9 +3943,7 @@ static void md_do_sync(mddev_t *mddev)
 	mddev->curr_resync = 2;
 
  try_again:
-	if (signal_pending(current) ||
-	    kthread_should_stop()) {
-		flush_signals(current);
+	if (kthread_should_stop()) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		goto skip;
 	}
@@ -3590,9 +3963,8 @@ static void md_do_sync(mddev_t *mddev)
 				 * time 'round when curr_resync == 2
 				 */
 				continue;
-			prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-			if (!signal_pending(current) &&
-			    !kthread_should_stop() &&
+			prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
+			if (!kthread_should_stop() &&
 			    mddev2->curr_resync >= mddev->curr_resync) {
 				printk(KERN_INFO "md: delaying resync of %s"
 				       " until %s has finished resync (they"
@@ -3608,12 +3980,13 @@ static void md_do_sync(mddev_t *mddev)
 		}
 	} while (mddev->curr_resync < 2);
 
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* resync follows the size requested by the personality,
 		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
-	else
+		mddev->resync_mismatches = 0;
+	} else
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
 
@@ -3626,7 +3999,8 @@ static void md_do_sync(mddev_t *mddev)
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
 	/* we don't use the checkpoint if there's a bitmap */
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
+	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 		j = mddev->recovery_cp;
 	else
 		j = 0;
@@ -3699,13 +4073,12 @@ static void md_do_sync(mddev_t *mddev)
 		}
 
 
-		if (signal_pending(current) || kthread_should_stop()) {
+		if (kthread_should_stop()) {
 			/*
 			 * got a signal, exit.
 			 */
 			printk(KERN_INFO
 				"md: md_do_sync() got signal ... exiting\n");
-			flush_signals(current);
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			goto out;
 		}
@@ -3727,7 +4100,7 @@ static void md_do_sync(mddev_t *mddev)
 	if (currspeed > sysctl_speed_limit_min) {
 		if ((currspeed > sysctl_speed_limit_max) ||
 		    !is_mddev_idle(mddev)) {
-			msleep_interruptible(250);
+			msleep(250);
 			goto repeat;
 		}
 	}
@@ -3820,7 +4193,7 @@ void md_check_recovery(mddev_t *mddev)
 	if (mddev_trylock(mddev)==0) {
 		int spares =0;
 
-		spin_lock(&mddev->write_lock);
+		spin_lock_irq(&mddev->write_lock);
 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
 			mddev->in_sync = 1;
@@ -3828,7 +4201,7 @@ void md_check_recovery(mddev_t *mddev)
 		}
 		if (mddev->safemode == 1)
 			mddev->safemode = 0;
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 
 		if (mddev->sb_dirty)
 			md_update_sb(mddev);
@@ -3864,9 +4237,13 @@ void md_check_recovery(mddev_t *mddev)
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			goto unlock;
 		}
-		if (mddev->recovery)
-			/* probably just the RECOVERY_NEEDED flag */
-			mddev->recovery = 0;
+		/* Clear some bits that don't mean anything, but
+		 * might be left set
+		 */
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
+		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
 		/* no recovery is running.
 		 * remove any failed drives, then
@@ -3876,31 +4253,41 @@ void md_check_recovery(mddev_t *mddev)
 		 */
 		ITERATE_RDEV(mddev,rdev,rtmp)
 			if (rdev->raid_disk >= 0 &&
-			    (rdev->faulty || ! rdev->in_sync) &&
+			    (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
 			    atomic_read(&rdev->nr_pending)==0) {
-				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
+				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
+					char nm[20];
+					sprintf(nm,"rd%d", rdev->raid_disk);
+					sysfs_remove_link(&mddev->kobj, nm);
 					rdev->raid_disk = -1;
+				}
 			}
 
 		if (mddev->degraded) {
 			ITERATE_RDEV(mddev,rdev,rtmp)
 				if (rdev->raid_disk < 0
-				    && !rdev->faulty) {
-					if (mddev->pers->hot_add_disk(mddev,rdev))
+				    && !test_bit(Faulty, &rdev->flags)) {
+					if (mddev->pers->hot_add_disk(mddev,rdev)) {
+						char nm[20];
+						sprintf(nm, "rd%d", rdev->raid_disk);
+						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 						spares++;
-					else
+					} else
 						break;
 				}
 		}
 
-		if (!spares && (mddev->recovery_cp == MaxSector )) {
-			/* nothing we can do ... */
+		if (spares) {
+			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		} else if (mddev->recovery_cp < MaxSector) {
+			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+			/* nothing to be done ... */
 			goto unlock;
-		}
+
 		if (mddev->pers->sync_request) {
 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-			if (!spares)
-				set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
 				/* We are adding a device or devices to an array
 				 * which has the bitmap stored on all devices.
@@ -3975,7 +4362,7 @@ static int __init md_init(void)
 		" MD_SB_DISKS=%d\n",
 		MD_MAJOR_VERSION, MD_MINOR_VERSION,
 		MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
-	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
+	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
 		BITMAP_MINOR);
 
 	if (register_blkdev(MAJOR_NR, "md"))
@@ -4039,7 +4426,7 @@ static void autostart_arrays(int part)
 		if (IS_ERR(rdev))
 			continue;
 
-		if (rdev->faulty) {
+		if (test_bit(Faulty, &rdev->flags)) {
 			MD_BUG();
 			continue;
 		}
@@ -4086,6 +4473,23 @@ static __exit void md_exit(void)
4086module_init(md_init) 4473module_init(md_init)
4087module_exit(md_exit) 4474module_exit(md_exit)
4088 4475
4476static int get_ro(char *buffer, struct kernel_param *kp)
4477{
4478 return sprintf(buffer, "%d", start_readonly);
4479}
4480static int set_ro(const char *val, struct kernel_param *kp)
4481{
4482 char *e;
4483 int num = simple_strtoul(val, &e, 10);
4484 if (*val && (*e == '\0' || *e == '\n')) {
4485 start_readonly = num;
 4486 return 0;
4487 }
4488 return -EINVAL;
4489}
4490
4491module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
4492
4089EXPORT_SYMBOL(register_md_personality); 4493EXPORT_SYMBOL(register_md_personality);
4090EXPORT_SYMBOL(unregister_md_personality); 4494EXPORT_SYMBOL(unregister_md_personality);
4091EXPORT_SYMBOL(md_error); 4495EXPORT_SYMBOL(md_error);
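
The start_ro parameter registered above uses module_param_call() rather than plain module_param() so that every write can be validated. A self-contained sketch of the same pattern for a hypothetical parameter named demo; the accessor signatures follow the 2.6-era struct kernel_param interface that this patch itself uses:

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/moduleparam.h>

    static int demo_value;

    static int demo_get(char *buffer, struct kernel_param *kp)
    {
            return sprintf(buffer, "%d", demo_value);
    }

    static int demo_set(const char *val, struct kernel_param *kp)
    {
            char *e;
            int num = simple_strtoul(val, &e, 10);

            /* reject an empty string or trailing garbage */
            if (!*val || (*e != '\0' && *e != '\n'))
                    return -EINVAL;
            demo_value = num;
            return 0;
    }

    module_param_call(demo, demo_set, demo_get, NULL, 0600);

With mode 0600 the value should also appear writable under /sys/module/<module>/parameters/, which makes runtime toggling of start_ro possible without a module reload.
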
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index c06f4474192b..145cdc5ad008 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -63,8 +63,8 @@ static int multipath_map (multipath_conf_t *conf)
63 63
64 rcu_read_lock(); 64 rcu_read_lock();
65 for (i = 0; i < disks; i++) { 65 for (i = 0; i < disks; i++) {
66 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 66 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
67 if (rdev && rdev->in_sync) { 67 if (rdev && test_bit(In_sync, &rdev->flags)) {
68 atomic_inc(&rdev->nr_pending); 68 atomic_inc(&rdev->nr_pending);
69 rcu_read_unlock(); 69 rcu_read_unlock();
70 return i; 70 return i;
@@ -139,8 +139,9 @@ static void unplug_slaves(mddev_t *mddev)
139 139
140 rcu_read_lock(); 140 rcu_read_lock();
141 for (i=0; i<mddev->raid_disks; i++) { 141 for (i=0; i<mddev->raid_disks; i++) {
142 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 142 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
143 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 143 if (rdev && !test_bit(Faulty, &rdev->flags)
144 && atomic_read(&rdev->nr_pending)) {
144 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 145 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
145 146
146 atomic_inc(&rdev->nr_pending); 147 atomic_inc(&rdev->nr_pending);
@@ -211,7 +212,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
211 for (i = 0; i < conf->raid_disks; i++) 212 for (i = 0; i < conf->raid_disks; i++)
212 seq_printf (seq, "%s", 213 seq_printf (seq, "%s",
213 conf->multipaths[i].rdev && 214 conf->multipaths[i].rdev &&
214 conf->multipaths[i].rdev->in_sync ? "U" : "_"); 215 test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
215 seq_printf (seq, "]"); 216 seq_printf (seq, "]");
216} 217}
217 218
@@ -224,8 +225,8 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
224 225
225 rcu_read_lock(); 226 rcu_read_lock();
226 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 227 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
227 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 228 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
228 if (rdev && !rdev->faulty) { 229 if (rdev && !test_bit(Faulty, &rdev->flags)) {
229 struct block_device *bdev = rdev->bdev; 230 struct block_device *bdev = rdev->bdev;
230 request_queue_t *r_queue = bdev_get_queue(bdev); 231 request_queue_t *r_queue = bdev_get_queue(bdev);
231 232
@@ -265,10 +266,10 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
265 /* 266 /*
266 * Mark disk as unusable 267 * Mark disk as unusable
267 */ 268 */
268 if (!rdev->faulty) { 269 if (!test_bit(Faulty, &rdev->flags)) {
269 char b[BDEVNAME_SIZE]; 270 char b[BDEVNAME_SIZE];
270 rdev->in_sync = 0; 271 clear_bit(In_sync, &rdev->flags);
271 rdev->faulty = 1; 272 set_bit(Faulty, &rdev->flags);
272 mddev->sb_dirty = 1; 273 mddev->sb_dirty = 1;
273 conf->working_disks--; 274 conf->working_disks--;
274 printk(KERN_ALERT "multipath: IO failure on %s," 275 printk(KERN_ALERT "multipath: IO failure on %s,"
@@ -298,7 +299,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
298 tmp = conf->multipaths + i; 299 tmp = conf->multipaths + i;
299 if (tmp->rdev) 300 if (tmp->rdev)
300 printk(" disk%d, o:%d, dev:%s\n", 301 printk(" disk%d, o:%d, dev:%s\n",
301 i,!tmp->rdev->faulty, 302 i,!test_bit(Faulty, &tmp->rdev->flags),
302 bdevname(tmp->rdev->bdev,b)); 303 bdevname(tmp->rdev->bdev,b));
303 } 304 }
304} 305}
@@ -330,8 +331,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
330 331
331 conf->working_disks++; 332 conf->working_disks++;
332 rdev->raid_disk = path; 333 rdev->raid_disk = path;
333 rdev->in_sync = 1; 334 set_bit(In_sync, &rdev->flags);
334 p->rdev = rdev; 335 rcu_assign_pointer(p->rdev, rdev);
335 found = 1; 336 found = 1;
336 } 337 }
337 338
@@ -350,7 +351,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
350 351
351 rdev = p->rdev; 352 rdev = p->rdev;
352 if (rdev) { 353 if (rdev) {
353 if (rdev->in_sync || 354 if (test_bit(In_sync, &rdev->flags) ||
354 atomic_read(&rdev->nr_pending)) { 355 atomic_read(&rdev->nr_pending)) {
355 printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); 356 printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number);
356 err = -EBUSY; 357 err = -EBUSY;
@@ -482,7 +483,7 @@ static int multipath_run (mddev_t *mddev)
482 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 483 mddev->queue->max_sectors > (PAGE_SIZE>>9))
483 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 484 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
484 485
485 if (!rdev->faulty) 486 if (!test_bit(Faulty, &rdev->flags))
486 conf->working_disks++; 487 conf->working_disks++;
487 } 488 }
488 489
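
A pattern repeated throughout this file (and in the raid1, raid10, raid5 and raid6 hunks below) is that every rdev lookup now goes through rcu_dereference(), and the device is pinned by bumping nr_pending before the RCU read lock is dropped. A condensed sketch of that reader side, with simplified stand-in types (demo_rdev and the DEMO_* bits are hypothetical):

    #include <linux/rcupdate.h>
    #include <linux/bitops.h>
    #include <asm/atomic.h>

    enum { DEMO_In_sync, DEMO_Faulty };  /* stand-ins for the new rdev flag bits */

    struct demo_rdev {
            unsigned long flags;
            atomic_t nr_pending;
    };

    static int pick_path(struct demo_rdev **paths, int npaths)
    {
            int i;

            rcu_read_lock();
            for (i = 0; i < npaths; i++) {
                    struct demo_rdev *rdev = rcu_dereference(paths[i]);

                    if (rdev && test_bit(DEMO_In_sync, &rdev->flags)) {
                            /* pin the device before leaving the RCU
                             * section, so a concurrent hot-remove
                             * cannot free it under us */
                            atomic_inc(&rdev->nr_pending);
                            rcu_read_unlock();
                            return i;
                    }
            }
            rcu_read_unlock();
            return -1;
    }

The matching writer side is visible in multipath_add_disk() above: the new rdev is published with rcu_assign_pointer() only after it has been fully initialised.
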
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e16f473bcf46..2da9d3ba902d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
301{ 301{
302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
304 int mirror, behind; 304 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
305 conf_t *conf = mddev_to_conf(r1_bio->mddev); 305 conf_t *conf = mddev_to_conf(r1_bio->mddev);
306 306
307 if (bio->bi_size) 307 if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
311 if (r1_bio->bios[mirror] == bio) 311 if (r1_bio->bios[mirror] == bio)
312 break; 312 break;
313 313
314 /* 314 if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
315 * this branch is our 'one mirror IO has finished' event handler: 315 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
316 */ 316 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
317 if (!uptodate) { 317 r1_bio->mddev->barriers_work = 0;
318 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 318 } else {
319 /* an I/O failed, we can't clear the bitmap */
320 set_bit(R1BIO_Degraded, &r1_bio->state);
321 } else
322 /* 319 /*
323 * Set R1BIO_Uptodate in our master bio, so that 320 * this branch is our 'one mirror IO has finished' event handler:
 324 * we will return a good error code to the higher
325 * levels even if IO on some other mirrored buffer fails.
326 *
327 * The 'master' represents the composite IO operation to
328 * user-side. So if something waits for IO, then it will
329 * wait for the 'master' bio.
330 */ 321 */
331 set_bit(R1BIO_Uptodate, &r1_bio->state); 322 r1_bio->bios[mirror] = NULL;
332 323 bio_put(bio);
333 update_head_pos(mirror, r1_bio); 324 if (!uptodate) {
334 325 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
335 behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 326 /* an I/O failed, we can't clear the bitmap */
336 if (behind) { 327 set_bit(R1BIO_Degraded, &r1_bio->state);
337 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 328 } else
338 atomic_dec(&r1_bio->behind_remaining); 329 /*
339 330 * Set R1BIO_Uptodate in our master bio, so that
 340 /* In behind mode, we ACK the master bio once the I/O has safely 331 * we will return a good error code to the higher
341 * reached all non-writemostly disks. Setting the Returned bit 332 * levels even if IO on some other mirrored buffer fails.
342 * ensures that this gets done only once -- we don't ever want to 333 *
343 * return -EIO here, instead we'll wait */ 334 * The 'master' represents the composite IO operation to
344 335 * user-side. So if something waits for IO, then it will
345 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 336 * wait for the 'master' bio.
346 test_bit(R1BIO_Uptodate, &r1_bio->state)) { 337 */
347 /* Maybe we can return now */ 338 set_bit(R1BIO_Uptodate, &r1_bio->state);
348 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 339
349 struct bio *mbio = r1_bio->master_bio; 340 update_head_pos(mirror, r1_bio);
350 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", 341
351 (unsigned long long) mbio->bi_sector, 342 if (behind) {
352 (unsigned long long) mbio->bi_sector + 343 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
353 (mbio->bi_size >> 9) - 1); 344 atomic_dec(&r1_bio->behind_remaining);
354 bio_endio(mbio, mbio->bi_size, 0); 345
346 /* In behind mode, we ACK the master bio once the I/O has safely
347 * reached all non-writemostly disks. Setting the Returned bit
348 * ensures that this gets done only once -- we don't ever want to
349 * return -EIO here, instead we'll wait */
350
351 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
352 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
353 /* Maybe we can return now */
354 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
355 struct bio *mbio = r1_bio->master_bio;
356 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
357 (unsigned long long) mbio->bi_sector,
358 (unsigned long long) mbio->bi_sector +
359 (mbio->bi_size >> 9) - 1);
360 bio_endio(mbio, mbio->bi_size, 0);
361 }
355 } 362 }
356 } 363 }
357 } 364 }
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
361 * already. 368 * already.
362 */ 369 */
363 if (atomic_dec_and_test(&r1_bio->remaining)) { 370 if (atomic_dec_and_test(&r1_bio->remaining)) {
371 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
372 reschedule_retry(r1_bio);
373 /* Don't dec_pending yet, we want to hold
374 * the reference over the retry
375 */
376 return 0;
377 }
364 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 378 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
365 /* free extra copy of the data pages */ 379 /* free extra copy of the data pages */
380/* FIXME bio has been freed!!! */
366 int i = bio->bi_vcnt; 381 int i = bio->bi_vcnt;
367 while (i--) 382 while (i--)
368 __free_page(bio->bi_io_vec[i].bv_page); 383 __free_page(bio->bi_io_vec[i].bv_page);
@@ -416,12 +431,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 416 /* Choose the first operational device, for consistency */ 431 /* Choose the first operational device, for consistency */
417 new_disk = 0; 432 new_disk = 0;
418 433
419 for (rdev = conf->mirrors[new_disk].rdev; 434 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
420 !rdev || !rdev->in_sync 435 !rdev || !test_bit(In_sync, &rdev->flags)
421 || test_bit(WriteMostly, &rdev->flags); 436 || test_bit(WriteMostly, &rdev->flags);
422 rdev = conf->mirrors[++new_disk].rdev) { 437 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
423 438
424 if (rdev && rdev->in_sync) 439 if (rdev && test_bit(In_sync, &rdev->flags))
425 wonly_disk = new_disk; 440 wonly_disk = new_disk;
426 441
427 if (new_disk == conf->raid_disks - 1) { 442 if (new_disk == conf->raid_disks - 1) {
@@ -434,12 +449,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
434 449
435 450
436 /* make sure the disk is operational */ 451 /* make sure the disk is operational */
437 for (rdev = conf->mirrors[new_disk].rdev; 452 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
438 !rdev || !rdev->in_sync || 453 !rdev || !test_bit(In_sync, &rdev->flags) ||
439 test_bit(WriteMostly, &rdev->flags); 454 test_bit(WriteMostly, &rdev->flags);
440 rdev = conf->mirrors[new_disk].rdev) { 455 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
441 456
442 if (rdev && rdev->in_sync) 457 if (rdev && test_bit(In_sync, &rdev->flags))
443 wonly_disk = new_disk; 458 wonly_disk = new_disk;
444 459
445 if (new_disk <= 0) 460 if (new_disk <= 0)
@@ -474,10 +489,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
474 disk = conf->raid_disks; 489 disk = conf->raid_disks;
475 disk--; 490 disk--;
476 491
477 rdev = conf->mirrors[disk].rdev; 492 rdev = rcu_dereference(conf->mirrors[disk].rdev);
478 493
479 if (!rdev || 494 if (!rdev ||
480 !rdev->in_sync || 495 !test_bit(In_sync, &rdev->flags) ||
481 test_bit(WriteMostly, &rdev->flags)) 496 test_bit(WriteMostly, &rdev->flags))
482 continue; 497 continue;
483 498
@@ -496,11 +511,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
496 511
497 512
498 if (new_disk >= 0) { 513 if (new_disk >= 0) {
499 rdev = conf->mirrors[new_disk].rdev; 514 rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
500 if (!rdev) 515 if (!rdev)
501 goto retry; 516 goto retry;
502 atomic_inc(&rdev->nr_pending); 517 atomic_inc(&rdev->nr_pending);
503 if (!rdev->in_sync) { 518 if (!test_bit(In_sync, &rdev->flags)) {
504 /* cannot risk returning a device that failed 519 /* cannot risk returning a device that failed
505 * before we inc'ed nr_pending 520 * before we inc'ed nr_pending
506 */ 521 */
@@ -522,8 +537,8 @@ static void unplug_slaves(mddev_t *mddev)
522 537
523 rcu_read_lock(); 538 rcu_read_lock();
524 for (i=0; i<mddev->raid_disks; i++) { 539 for (i=0; i<mddev->raid_disks; i++) {
525 mdk_rdev_t *rdev = conf->mirrors[i].rdev; 540 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
526 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 541 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
527 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 542 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
528 543
529 atomic_inc(&rdev->nr_pending); 544 atomic_inc(&rdev->nr_pending);
@@ -556,8 +571,8 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
556 571
557 rcu_read_lock(); 572 rcu_read_lock();
558 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 573 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
559 mdk_rdev_t *rdev = conf->mirrors[i].rdev; 574 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
560 if (rdev && !rdev->faulty) { 575 if (rdev && !test_bit(Faulty, &rdev->flags)) {
561 struct block_device *bdev = rdev->bdev; 576 struct block_device *bdev = rdev->bdev;
562 request_queue_t *r_queue = bdev_get_queue(bdev); 577 request_queue_t *r_queue = bdev_get_queue(bdev);
563 578
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
648 struct bio_list bl; 663 struct bio_list bl;
649 struct page **behind_pages = NULL; 664 struct page **behind_pages = NULL;
650 const int rw = bio_data_dir(bio); 665 const int rw = bio_data_dir(bio);
666 int do_barriers;
651 667
652 if (unlikely(bio_barrier(bio))) { 668 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
653 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 669 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
654 return 0; 670 return 0;
655 } 671 }
@@ -728,10 +744,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
728#endif 744#endif
729 rcu_read_lock(); 745 rcu_read_lock();
730 for (i = 0; i < disks; i++) { 746 for (i = 0; i < disks; i++) {
731 if ((rdev=conf->mirrors[i].rdev) != NULL && 747 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
732 !rdev->faulty) { 748 !test_bit(Faulty, &rdev->flags)) {
733 atomic_inc(&rdev->nr_pending); 749 atomic_inc(&rdev->nr_pending);
734 if (rdev->faulty) { 750 if (test_bit(Faulty, &rdev->flags)) {
735 atomic_dec(&rdev->nr_pending); 751 atomic_dec(&rdev->nr_pending);
736 r1_bio->bios[i] = NULL; 752 r1_bio->bios[i] = NULL;
737 } else 753 } else
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
759 atomic_set(&r1_bio->remaining, 0); 775 atomic_set(&r1_bio->remaining, 0);
760 atomic_set(&r1_bio->behind_remaining, 0); 776 atomic_set(&r1_bio->behind_remaining, 0);
761 777
778 do_barriers = bio->bi_rw & BIO_RW_BARRIER;
779 if (do_barriers)
780 set_bit(R1BIO_Barrier, &r1_bio->state);
781
762 bio_list_init(&bl); 782 bio_list_init(&bl);
763 for (i = 0; i < disks; i++) { 783 for (i = 0; i < disks; i++) {
764 struct bio *mbio; 784 struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
771 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 791 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
772 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 792 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
773 mbio->bi_end_io = raid1_end_write_request; 793 mbio->bi_end_io = raid1_end_write_request;
774 mbio->bi_rw = WRITE; 794 mbio->bi_rw = WRITE | do_barriers;
775 mbio->bi_private = r1_bio; 795 mbio->bi_private = r1_bio;
776 796
777 if (behind_pages) { 797 if (behind_pages) {
@@ -824,7 +844,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
824 for (i = 0; i < conf->raid_disks; i++) 844 for (i = 0; i < conf->raid_disks; i++)
825 seq_printf(seq, "%s", 845 seq_printf(seq, "%s",
826 conf->mirrors[i].rdev && 846 conf->mirrors[i].rdev &&
827 conf->mirrors[i].rdev->in_sync ? "U" : "_"); 847 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
828 seq_printf(seq, "]"); 848 seq_printf(seq, "]");
829} 849}
830 850
@@ -840,14 +860,14 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
840 * next level up know. 860 * next level up know.
841 * else mark the drive as failed 861 * else mark the drive as failed
842 */ 862 */
843 if (rdev->in_sync 863 if (test_bit(In_sync, &rdev->flags)
844 && conf->working_disks == 1) 864 && conf->working_disks == 1)
845 /* 865 /*
846 * Don't fail the drive, act as though we were just a 866 * Don't fail the drive, act as though we were just a
847 * normal single drive 867 * normal single drive
848 */ 868 */
849 return; 869 return;
850 if (rdev->in_sync) { 870 if (test_bit(In_sync, &rdev->flags)) {
851 mddev->degraded++; 871 mddev->degraded++;
852 conf->working_disks--; 872 conf->working_disks--;
853 /* 873 /*
@@ -855,8 +875,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
855 */ 875 */
856 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 876 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
857 } 877 }
858 rdev->in_sync = 0; 878 clear_bit(In_sync, &rdev->flags);
859 rdev->faulty = 1; 879 set_bit(Faulty, &rdev->flags);
860 mddev->sb_dirty = 1; 880 mddev->sb_dirty = 1;
861 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" 881 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
862 " Operation continuing on %d devices\n", 882 " Operation continuing on %d devices\n",
@@ -881,7 +901,7 @@ static void print_conf(conf_t *conf)
881 tmp = conf->mirrors + i; 901 tmp = conf->mirrors + i;
882 if (tmp->rdev) 902 if (tmp->rdev)
883 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 903 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
884 i, !tmp->rdev->in_sync, !tmp->rdev->faulty, 904 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags),
885 bdevname(tmp->rdev->bdev,b)); 905 bdevname(tmp->rdev->bdev,b));
886 } 906 }
887} 907}
@@ -913,11 +933,11 @@ static int raid1_spare_active(mddev_t *mddev)
913 for (i = 0; i < conf->raid_disks; i++) { 933 for (i = 0; i < conf->raid_disks; i++) {
914 tmp = conf->mirrors + i; 934 tmp = conf->mirrors + i;
915 if (tmp->rdev 935 if (tmp->rdev
916 && !tmp->rdev->faulty 936 && !test_bit(Faulty, &tmp->rdev->flags)
917 && !tmp->rdev->in_sync) { 937 && !test_bit(In_sync, &tmp->rdev->flags)) {
918 conf->working_disks++; 938 conf->working_disks++;
919 mddev->degraded--; 939 mddev->degraded--;
920 tmp->rdev->in_sync = 1; 940 set_bit(In_sync, &tmp->rdev->flags);
921 } 941 }
922 } 942 }
923 943
@@ -954,7 +974,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
954 found = 1; 974 found = 1;
955 if (rdev->saved_raid_disk != mirror) 975 if (rdev->saved_raid_disk != mirror)
956 conf->fullsync = 1; 976 conf->fullsync = 1;
957 p->rdev = rdev; 977 rcu_assign_pointer(p->rdev, rdev);
958 break; 978 break;
959 } 979 }
960 980
@@ -972,7 +992,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
972 print_conf(conf); 992 print_conf(conf);
973 rdev = p->rdev; 993 rdev = p->rdev;
974 if (rdev) { 994 if (rdev) {
975 if (rdev->in_sync || 995 if (test_bit(In_sync, &rdev->flags) ||
976 atomic_read(&rdev->nr_pending)) { 996 atomic_read(&rdev->nr_pending)) {
977 err = -EBUSY; 997 err = -EBUSY;
978 goto abort; 998 goto abort;
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
1153 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1173 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1154 sync_request_write(mddev, r1_bio); 1174 sync_request_write(mddev, r1_bio);
1155 unplug = 1; 1175 unplug = 1;
1176 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1177 /* some requests in the r1bio were BIO_RW_BARRIER
1178 * requests which failed with -ENOTSUPP. Hohumm..
1179 * Better resubmit without the barrier.
1180 * We know which devices to resubmit for, because
1181 * all others have had their bios[] entry cleared.
1182 */
1183 int i;
1184 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1185 clear_bit(R1BIO_Barrier, &r1_bio->state);
1186 for (i=0; i < conf->raid_disks; i++)
1187 if (r1_bio->bios[i]) {
1188 struct bio_vec *bvec;
1189 int j;
1190
1191 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1192 /* copy pages from the failed bio, as
1193 * this might be a write-behind device */
1194 __bio_for_each_segment(bvec, bio, j, 0)
1195 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1196 bio_put(r1_bio->bios[i]);
1197 bio->bi_sector = r1_bio->sector +
1198 conf->mirrors[i].rdev->data_offset;
1199 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1200 bio->bi_end_io = raid1_end_write_request;
1201 bio->bi_rw = WRITE;
1202 bio->bi_private = r1_bio;
1203 r1_bio->bios[i] = bio;
1204 generic_make_request(bio);
1205 }
1156 } else { 1206 } else {
1157 int disk; 1207 int disk;
1158 bio = r1_bio->bios[r1_bio->read_disk]; 1208 bio = r1_bio->bios[r1_bio->read_disk];
@@ -1260,7 +1310,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1260 * This call the bitmap_start_sync doesn't actually record anything 1310 * This call the bitmap_start_sync doesn't actually record anything
1261 */ 1311 */
1262 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1312 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1263 !conf->fullsync) { 1313 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1264 /* We can skip this block, and probably several more */ 1314 /* We can skip this block, and probably several more */
1265 *skipped = 1; 1315 *skipped = 1;
1266 return sync_blocks; 1316 return sync_blocks;
@@ -1282,11 +1332,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1282 /* make sure disk is operational */ 1332 /* make sure disk is operational */
1283 wonly = disk; 1333 wonly = disk;
1284 while (conf->mirrors[disk].rdev == NULL || 1334 while (conf->mirrors[disk].rdev == NULL ||
1285 !conf->mirrors[disk].rdev->in_sync || 1335 !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) ||
1286 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) 1336 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
1287 ) { 1337 ) {
1288 if (conf->mirrors[disk].rdev && 1338 if (conf->mirrors[disk].rdev &&
1289 conf->mirrors[disk].rdev->in_sync) 1339 test_bit(In_sync, &conf->mirrors[disk].rdev->flags))
1290 wonly = disk; 1340 wonly = disk;
1291 if (disk <= 0) 1341 if (disk <= 0)
1292 disk = conf->raid_disks; 1342 disk = conf->raid_disks;
@@ -1333,11 +1383,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1333 bio->bi_rw = READ; 1383 bio->bi_rw = READ;
1334 bio->bi_end_io = end_sync_read; 1384 bio->bi_end_io = end_sync_read;
1335 } else if (conf->mirrors[i].rdev == NULL || 1385 } else if (conf->mirrors[i].rdev == NULL ||
1336 conf->mirrors[i].rdev->faulty) { 1386 test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
1337 still_degraded = 1; 1387 still_degraded = 1;
1338 continue; 1388 continue;
1339 } else if (!conf->mirrors[i].rdev->in_sync || 1389 } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
1340 sector_nr + RESYNC_SECTORS > mddev->recovery_cp) { 1390 sector_nr + RESYNC_SECTORS > mddev->recovery_cp ||
1391 test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1341 bio->bi_rw = WRITE; 1392 bio->bi_rw = WRITE;
1342 bio->bi_end_io = end_sync_write; 1393 bio->bi_end_io = end_sync_write;
1343 write_targets ++; 1394 write_targets ++;
@@ -1371,8 +1422,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1371 break; 1422 break;
1372 if (sync_blocks == 0) { 1423 if (sync_blocks == 0) {
1373 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 1424 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1374 &sync_blocks, still_degraded) && 1425 &sync_blocks, still_degraded) &&
1375 !conf->fullsync) 1426 !conf->fullsync &&
1427 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1376 break; 1428 break;
1377 if (sync_blocks < (PAGE_SIZE>>9)) 1429 if (sync_blocks < (PAGE_SIZE>>9))
1378 BUG(); 1430 BUG();
@@ -1478,7 +1530,7 @@ static int run(mddev_t *mddev)
1478 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1530 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1479 1531
1480 disk->head_position = 0; 1532 disk->head_position = 0;
1481 if (!rdev->faulty && rdev->in_sync) 1533 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
1482 conf->working_disks++; 1534 conf->working_disks++;
1483 } 1535 }
1484 conf->raid_disks = mddev->raid_disks; 1536 conf->raid_disks = mddev->raid_disks;
@@ -1518,7 +1570,7 @@ static int run(mddev_t *mddev)
1518 */ 1570 */
1519 for (j = 0; j < conf->raid_disks && 1571 for (j = 0; j < conf->raid_disks &&
1520 (!conf->mirrors[j].rdev || 1572 (!conf->mirrors[j].rdev ||
1521 !conf->mirrors[j].rdev->in_sync) ; j++) 1573 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
1522 /* nothing */; 1574 /* nothing */;
1523 conf->last_used = j; 1575 conf->last_used = j;
1524 1576
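
Two behaviours interact in the rewritten raid1 completion path above: barrier writes that a device rejects with -EOPNOTSUPP are parked via R1BIO_BarrierRetry and resubmitted by raid1d() without the barrier flag, and write-behind requests may complete the master bio before every mirror has finished. The early-ACK test can be read in isolation; a standalone restatement with plain ints standing in for the atomic counters:

    /* true when every write still outstanding is to a write-behind
     * (write-mostly) device and at least one other mirror has already
     * completed successfully */
    static int can_ack_master(int behind_remaining, int remaining, int uptodate)
    {
            return uptodate && behind_remaining >= remaining - 1;
    }

The R1BIO_Returned bit then makes sure bio_endio() is called on the master bio exactly once, whichever path gets there first.
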
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index bbe40e9cf923..867f06ae33d9 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -496,6 +496,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
496 int disk, slot, nslot; 496 int disk, slot, nslot;
497 const int sectors = r10_bio->sectors; 497 const int sectors = r10_bio->sectors;
498 sector_t new_distance, current_distance; 498 sector_t new_distance, current_distance;
499 mdk_rdev_t *rdev;
499 500
500 raid10_find_phys(conf, r10_bio); 501 raid10_find_phys(conf, r10_bio);
501 rcu_read_lock(); 502 rcu_read_lock();
@@ -510,8 +511,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
510 slot = 0; 511 slot = 0;
511 disk = r10_bio->devs[slot].devnum; 512 disk = r10_bio->devs[slot].devnum;
512 513
513 while (!conf->mirrors[disk].rdev || 514 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
514 !conf->mirrors[disk].rdev->in_sync) { 515 !test_bit(In_sync, &rdev->flags)) {
515 slot++; 516 slot++;
516 if (slot == conf->copies) { 517 if (slot == conf->copies) {
517 slot = 0; 518 slot = 0;
@@ -527,8 +528,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
527 /* make sure the disk is operational */ 528 /* make sure the disk is operational */
528 slot = 0; 529 slot = 0;
529 disk = r10_bio->devs[slot].devnum; 530 disk = r10_bio->devs[slot].devnum;
530 while (!conf->mirrors[disk].rdev || 531 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
531 !conf->mirrors[disk].rdev->in_sync) { 532 !test_bit(In_sync, &rdev->flags)) {
532 slot ++; 533 slot ++;
533 if (slot == conf->copies) { 534 if (slot == conf->copies) {
534 disk = -1; 535 disk = -1;
@@ -547,11 +548,11 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
547 int ndisk = r10_bio->devs[nslot].devnum; 548 int ndisk = r10_bio->devs[nslot].devnum;
548 549
549 550
550 if (!conf->mirrors[ndisk].rdev || 551 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
551 !conf->mirrors[ndisk].rdev->in_sync) 552 !test_bit(In_sync, &rdev->flags))
552 continue; 553 continue;
553 554
554 if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) { 555 if (!atomic_read(&rdev->nr_pending)) {
555 disk = ndisk; 556 disk = ndisk;
556 slot = nslot; 557 slot = nslot;
557 break; 558 break;
@@ -569,7 +570,7 @@ rb_out:
569 r10_bio->read_slot = slot; 570 r10_bio->read_slot = slot;
570/* conf->next_seq_sect = this_sector + sectors;*/ 571/* conf->next_seq_sect = this_sector + sectors;*/
571 572
572 if (disk >= 0 && conf->mirrors[disk].rdev) 573 if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
573 atomic_inc(&conf->mirrors[disk].rdev->nr_pending); 574 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
574 rcu_read_unlock(); 575 rcu_read_unlock();
575 576
@@ -583,8 +584,8 @@ static void unplug_slaves(mddev_t *mddev)
583 584
584 rcu_read_lock(); 585 rcu_read_lock();
585 for (i=0; i<mddev->raid_disks; i++) { 586 for (i=0; i<mddev->raid_disks; i++) {
586 mdk_rdev_t *rdev = conf->mirrors[i].rdev; 587 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
587 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 588 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
588 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 589 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
589 590
590 atomic_inc(&rdev->nr_pending); 591 atomic_inc(&rdev->nr_pending);
@@ -614,8 +615,8 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
614 615
615 rcu_read_lock(); 616 rcu_read_lock();
616 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 617 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
617 mdk_rdev_t *rdev = conf->mirrors[i].rdev; 618 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
618 if (rdev && !rdev->faulty) { 619 if (rdev && !test_bit(Faulty, &rdev->flags)) {
619 struct block_device *bdev = rdev->bdev; 620 struct block_device *bdev = rdev->bdev;
620 request_queue_t *r_queue = bdev_get_queue(bdev); 621 request_queue_t *r_queue = bdev_get_queue(bdev);
621 622
@@ -768,9 +769,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
768 rcu_read_lock(); 769 rcu_read_lock();
769 for (i = 0; i < conf->copies; i++) { 770 for (i = 0; i < conf->copies; i++) {
770 int d = r10_bio->devs[i].devnum; 771 int d = r10_bio->devs[i].devnum;
771 if (conf->mirrors[d].rdev && 772 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
772 !conf->mirrors[d].rdev->faulty) { 773 if (rdev &&
773 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 774 !test_bit(Faulty, &rdev->flags)) {
775 atomic_inc(&rdev->nr_pending);
774 r10_bio->devs[i].bio = bio; 776 r10_bio->devs[i].bio = bio;
775 } else 777 } else
776 r10_bio->devs[i].bio = NULL; 778 r10_bio->devs[i].bio = NULL;
@@ -824,7 +826,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
824 for (i = 0; i < conf->raid_disks; i++) 826 for (i = 0; i < conf->raid_disks; i++)
825 seq_printf(seq, "%s", 827 seq_printf(seq, "%s",
826 conf->mirrors[i].rdev && 828 conf->mirrors[i].rdev &&
827 conf->mirrors[i].rdev->in_sync ? "U" : "_"); 829 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
828 seq_printf(seq, "]"); 830 seq_printf(seq, "]");
829} 831}
830 832
@@ -839,7 +841,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
839 * next level up know. 841 * next level up know.
840 * else mark the drive as failed 842 * else mark the drive as failed
841 */ 843 */
842 if (rdev->in_sync 844 if (test_bit(In_sync, &rdev->flags)
843 && conf->working_disks == 1) 845 && conf->working_disks == 1)
844 /* 846 /*
845 * Don't fail the drive, just return an IO error. 847 * Don't fail the drive, just return an IO error.
@@ -849,7 +851,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
849 * really dead" tests... 851 * really dead" tests...
850 */ 852 */
851 return; 853 return;
852 if (rdev->in_sync) { 854 if (test_bit(In_sync, &rdev->flags)) {
853 mddev->degraded++; 855 mddev->degraded++;
854 conf->working_disks--; 856 conf->working_disks--;
855 /* 857 /*
@@ -857,8 +859,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
857 */ 859 */
858 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 860 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
859 } 861 }
860 rdev->in_sync = 0; 862 clear_bit(In_sync, &rdev->flags);
861 rdev->faulty = 1; 863 set_bit(Faulty, &rdev->flags);
862 mddev->sb_dirty = 1; 864 mddev->sb_dirty = 1;
863 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" 865 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n"
864 " Operation continuing on %d devices\n", 866 " Operation continuing on %d devices\n",
@@ -883,7 +885,8 @@ static void print_conf(conf_t *conf)
883 tmp = conf->mirrors + i; 885 tmp = conf->mirrors + i;
884 if (tmp->rdev) 886 if (tmp->rdev)
885 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 887 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
886 i, !tmp->rdev->in_sync, !tmp->rdev->faulty, 888 i, !test_bit(In_sync, &tmp->rdev->flags),
889 !test_bit(Faulty, &tmp->rdev->flags),
887 bdevname(tmp->rdev->bdev,b)); 890 bdevname(tmp->rdev->bdev,b));
888 } 891 }
889} 892}
@@ -936,11 +939,11 @@ static int raid10_spare_active(mddev_t *mddev)
936 for (i = 0; i < conf->raid_disks; i++) { 939 for (i = 0; i < conf->raid_disks; i++) {
937 tmp = conf->mirrors + i; 940 tmp = conf->mirrors + i;
938 if (tmp->rdev 941 if (tmp->rdev
939 && !tmp->rdev->faulty 942 && !test_bit(Faulty, &tmp->rdev->flags)
940 && !tmp->rdev->in_sync) { 943 && !test_bit(In_sync, &tmp->rdev->flags)) {
941 conf->working_disks++; 944 conf->working_disks++;
942 mddev->degraded--; 945 mddev->degraded--;
943 tmp->rdev->in_sync = 1; 946 set_bit(In_sync, &tmp->rdev->flags);
944 } 947 }
945 } 948 }
946 949
@@ -980,7 +983,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
980 p->head_position = 0; 983 p->head_position = 0;
981 rdev->raid_disk = mirror; 984 rdev->raid_disk = mirror;
982 found = 1; 985 found = 1;
983 p->rdev = rdev; 986 rcu_assign_pointer(p->rdev, rdev);
984 break; 987 break;
985 } 988 }
986 989
@@ -998,7 +1001,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
998 print_conf(conf); 1001 print_conf(conf);
999 rdev = p->rdev; 1002 rdev = p->rdev;
1000 if (rdev) { 1003 if (rdev) {
1001 if (rdev->in_sync || 1004 if (test_bit(In_sync, &rdev->flags) ||
1002 atomic_read(&rdev->nr_pending)) { 1005 atomic_read(&rdev->nr_pending)) {
1003 err = -EBUSY; 1006 err = -EBUSY;
1004 goto abort; 1007 goto abort;
@@ -1414,7 +1417,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1414 1417
1415 for (i=0 ; i<conf->raid_disks; i++) 1418 for (i=0 ; i<conf->raid_disks; i++)
1416 if (conf->mirrors[i].rdev && 1419 if (conf->mirrors[i].rdev &&
1417 !conf->mirrors[i].rdev->in_sync) { 1420 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1418 /* want to reconstruct this device */ 1421 /* want to reconstruct this device */
1419 r10bio_t *rb2 = r10_bio; 1422 r10bio_t *rb2 = r10_bio;
1420 1423
@@ -1435,7 +1438,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1435 for (j=0; j<conf->copies;j++) { 1438 for (j=0; j<conf->copies;j++) {
1436 int d = r10_bio->devs[j].devnum; 1439 int d = r10_bio->devs[j].devnum;
1437 if (conf->mirrors[d].rdev && 1440 if (conf->mirrors[d].rdev &&
1438 conf->mirrors[d].rdev->in_sync) { 1441 test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1439 /* This is where we read from */ 1442 /* This is where we read from */
1440 bio = r10_bio->devs[0].bio; 1443 bio = r10_bio->devs[0].bio;
1441 bio->bi_next = biolist; 1444 bio->bi_next = biolist;
@@ -1511,7 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1511 bio = r10_bio->devs[i].bio; 1514 bio = r10_bio->devs[i].bio;
1512 bio->bi_end_io = NULL; 1515 bio->bi_end_io = NULL;
1513 if (conf->mirrors[d].rdev == NULL || 1516 if (conf->mirrors[d].rdev == NULL ||
1514 conf->mirrors[d].rdev->faulty) 1517 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1515 continue; 1518 continue;
1516 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1519 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1517 atomic_inc(&r10_bio->remaining); 1520 atomic_inc(&r10_bio->remaining);
@@ -1697,7 +1700,7 @@ static int run(mddev_t *mddev)
1697 mddev->queue->max_sectors = (PAGE_SIZE>>9); 1700 mddev->queue->max_sectors = (PAGE_SIZE>>9);
1698 1701
1699 disk->head_position = 0; 1702 disk->head_position = 0;
1700 if (!rdev->faulty && rdev->in_sync) 1703 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
1701 conf->working_disks++; 1704 conf->working_disks++;
1702 } 1705 }
1703 conf->raid_disks = mddev->raid_disks; 1706 conf->raid_disks = mddev->raid_disks;
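
Beyond the flags conversion, the raid10 read_balance() hunks above introduce a local rdev that caches each rcu_dereference() result, so all subsequent tests operate on one snapshot: a second read of conf->mirrors[d].rdev could legitimately return a different pointer. A sketch of the idiom, reusing the hypothetical demo types from the multipath note (the caller is assumed to hold rcu_read_lock()):

    static struct demo_rdev *snapshot_rdev(struct demo_rdev **slot)
    {
            /* dereference once; every later test, and the eventual
             * nr_pending increment, must use this local copy */
            struct demo_rdev *rdev = rcu_dereference(*slot);

            if (rdev && test_bit(DEMO_In_sync, &rdev->flags))
                    return rdev;
            return NULL;
    }
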
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1223e98ecd70..e2a40283e323 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -293,9 +293,31 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
293 return sh; 293 return sh;
294} 294}
295 295
296static int grow_stripes(raid5_conf_t *conf, int num) 296static int grow_one_stripe(raid5_conf_t *conf)
297{ 297{
298 struct stripe_head *sh; 298 struct stripe_head *sh;
299 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
300 if (!sh)
301 return 0;
302 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
303 sh->raid_conf = conf;
304 spin_lock_init(&sh->lock);
305
306 if (grow_buffers(sh, conf->raid_disks)) {
307 shrink_buffers(sh, conf->raid_disks);
308 kmem_cache_free(conf->slab_cache, sh);
309 return 0;
310 }
311 /* we just created an active stripe so... */
312 atomic_set(&sh->count, 1);
313 atomic_inc(&conf->active_stripes);
314 INIT_LIST_HEAD(&sh->lru);
315 release_stripe(sh);
316 return 1;
317}
318
319static int grow_stripes(raid5_conf_t *conf, int num)
320{
299 kmem_cache_t *sc; 321 kmem_cache_t *sc;
300 int devs = conf->raid_disks; 322 int devs = conf->raid_disks;
301 323
@@ -308,48 +330,39 @@ static int grow_stripes(raid5_conf_t *conf, int num)
308 return 1; 330 return 1;
309 conf->slab_cache = sc; 331 conf->slab_cache = sc;
310 while (num--) { 332 while (num--) {
311 sh = kmem_cache_alloc(sc, GFP_KERNEL); 333 if (!grow_one_stripe(conf))
312 if (!sh)
313 return 1;
314 memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
315 sh->raid_conf = conf;
316 spin_lock_init(&sh->lock);
317
318 if (grow_buffers(sh, conf->raid_disks)) {
319 shrink_buffers(sh, conf->raid_disks);
320 kmem_cache_free(sc, sh);
321 return 1; 334 return 1;
322 }
323 /* we just created an active stripe so... */
324 atomic_set(&sh->count, 1);
325 atomic_inc(&conf->active_stripes);
326 INIT_LIST_HEAD(&sh->lru);
327 release_stripe(sh);
328 } 335 }
329 return 0; 336 return 0;
330} 337}
331 338
332static void shrink_stripes(raid5_conf_t *conf) 339static int drop_one_stripe(raid5_conf_t *conf)
333{ 340{
334 struct stripe_head *sh; 341 struct stripe_head *sh;
335 342
336 while (1) { 343 spin_lock_irq(&conf->device_lock);
337 spin_lock_irq(&conf->device_lock); 344 sh = get_free_stripe(conf);
338 sh = get_free_stripe(conf); 345 spin_unlock_irq(&conf->device_lock);
339 spin_unlock_irq(&conf->device_lock); 346 if (!sh)
340 if (!sh) 347 return 0;
341 break; 348 if (atomic_read(&sh->count))
342 if (atomic_read(&sh->count)) 349 BUG();
343 BUG(); 350 shrink_buffers(sh, conf->raid_disks);
344 shrink_buffers(sh, conf->raid_disks); 351 kmem_cache_free(conf->slab_cache, sh);
345 kmem_cache_free(conf->slab_cache, sh); 352 atomic_dec(&conf->active_stripes);
346 atomic_dec(&conf->active_stripes); 353 return 1;
347 } 354}
355
356static void shrink_stripes(raid5_conf_t *conf)
357{
358 while (drop_one_stripe(conf))
359 ;
360
348 kmem_cache_destroy(conf->slab_cache); 361 kmem_cache_destroy(conf->slab_cache);
349 conf->slab_cache = NULL; 362 conf->slab_cache = NULL;
350} 363}
351 364
352static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done, 365static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
353 int error) 366 int error)
354{ 367{
355 struct stripe_head *sh = bi->bi_private; 368 struct stripe_head *sh = bi->bi_private;
@@ -401,10 +414,35 @@ static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done,
401 } 414 }
402#else 415#else
403 set_bit(R5_UPTODATE, &sh->dev[i].flags); 416 set_bit(R5_UPTODATE, &sh->dev[i].flags);
404#endif 417#endif
418 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
419 printk("R5: read error corrected!!\n");
420 clear_bit(R5_ReadError, &sh->dev[i].flags);
421 clear_bit(R5_ReWrite, &sh->dev[i].flags);
422 }
423 if (atomic_read(&conf->disks[i].rdev->read_errors))
424 atomic_set(&conf->disks[i].rdev->read_errors, 0);
405 } else { 425 } else {
406 md_error(conf->mddev, conf->disks[i].rdev); 426 int retry = 0;
407 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 427 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
428 atomic_inc(&conf->disks[i].rdev->read_errors);
429 if (conf->mddev->degraded)
430 printk("R5: read error not correctable.\n");
431 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
432 /* Oh, no!!! */
433 printk("R5: read error NOT corrected!!\n");
434 else if (atomic_read(&conf->disks[i].rdev->read_errors)
435 > conf->max_nr_stripes)
436 printk("raid5: Too many read errors, failing device.\n");
437 else
438 retry = 1;
439 if (retry)
440 set_bit(R5_ReadError, &sh->dev[i].flags);
441 else {
442 clear_bit(R5_ReadError, &sh->dev[i].flags);
443 clear_bit(R5_ReWrite, &sh->dev[i].flags);
444 md_error(conf->mddev, conf->disks[i].rdev);
445 }
408 } 446 }
409 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 447 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
410#if 0 448#if 0
@@ -487,19 +525,19 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
487 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 525 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
488 PRINTK("raid5: error called\n"); 526 PRINTK("raid5: error called\n");
489 527
490 if (!rdev->faulty) { 528 if (!test_bit(Faulty, &rdev->flags)) {
491 mddev->sb_dirty = 1; 529 mddev->sb_dirty = 1;
492 if (rdev->in_sync) { 530 if (test_bit(In_sync, &rdev->flags)) {
493 conf->working_disks--; 531 conf->working_disks--;
494 mddev->degraded++; 532 mddev->degraded++;
495 conf->failed_disks++; 533 conf->failed_disks++;
496 rdev->in_sync = 0; 534 clear_bit(In_sync, &rdev->flags);
497 /* 535 /*
498 * if recovery was running, make sure it aborts. 536 * if recovery was running, make sure it aborts.
499 */ 537 */
500 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 538 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
501 } 539 }
502 rdev->faulty = 1; 540 set_bit(Faulty, &rdev->flags);
503 printk (KERN_ALERT 541 printk (KERN_ALERT
504 "raid5: Disk failure on %s, disabling device." 542 "raid5: Disk failure on %s, disabling device."
505 " Operation continuing on %d devices\n", 543 " Operation continuing on %d devices\n",
@@ -965,7 +1003,13 @@ static void handle_stripe(struct stripe_head *sh)
965 } 1003 }
966 if (dev->written) written++; 1004 if (dev->written) written++;
967 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ 1005 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
968 if (!rdev || !rdev->in_sync) { 1006 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 1007 /* The ReadError flag will just be confusing now */
1008 clear_bit(R5_ReadError, &dev->flags);
1009 clear_bit(R5_ReWrite, &dev->flags);
1010 }
1011 if (!rdev || !test_bit(In_sync, &rdev->flags)
1012 || test_bit(R5_ReadError, &dev->flags)) {
969 failed++; 1013 failed++;
970 failed_num = i; 1014 failed_num = i;
971 } else 1015 } else
@@ -980,6 +1024,14 @@ static void handle_stripe(struct stripe_head *sh)
980 if (failed > 1 && to_read+to_write+written) { 1024 if (failed > 1 && to_read+to_write+written) {
981 for (i=disks; i--; ) { 1025 for (i=disks; i--; ) {
982 int bitmap_end = 0; 1026 int bitmap_end = 0;
1027
1028 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1029 mdk_rdev_t *rdev = conf->disks[i].rdev;
1030 if (rdev && test_bit(In_sync, &rdev->flags))
1031 /* multiple read failures in one stripe */
1032 md_error(conf->mddev, rdev);
1033 }
1034
983 spin_lock_irq(&conf->device_lock); 1035 spin_lock_irq(&conf->device_lock);
984 /* fail all writes first */ 1036 /* fail all writes first */
985 bi = sh->dev[i].towrite; 1037 bi = sh->dev[i].towrite;
@@ -1015,7 +1067,8 @@ static void handle_stripe(struct stripe_head *sh)
1015 } 1067 }
1016 1068
1017 /* fail any reads if this device is non-operational */ 1069 /* fail any reads if this device is non-operational */
1018 if (!test_bit(R5_Insync, &sh->dev[i].flags)) { 1070 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1071 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1019 bi = sh->dev[i].toread; 1072 bi = sh->dev[i].toread;
1020 sh->dev[i].toread = NULL; 1073 sh->dev[i].toread = NULL;
1021 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1074 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -1247,6 +1300,11 @@ static void handle_stripe(struct stripe_head *sh)
1247 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { 1300 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1248 /* parity is correct (on disc, not in buffer any more) */ 1301 /* parity is correct (on disc, not in buffer any more) */
1249 set_bit(STRIPE_INSYNC, &sh->state); 1302 set_bit(STRIPE_INSYNC, &sh->state);
1303 } else {
1304 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1305 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1306 /* don't try to repair!! */
1307 set_bit(STRIPE_INSYNC, &sh->state);
1250 } 1308 }
1251 } 1309 }
1252 if (!test_bit(STRIPE_INSYNC, &sh->state)) { 1310 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -1274,7 +1332,27 @@ static void handle_stripe(struct stripe_head *sh)
1274 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 1332 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1275 clear_bit(STRIPE_SYNCING, &sh->state); 1333 clear_bit(STRIPE_SYNCING, &sh->state);
1276 } 1334 }
1277 1335
1336 /* If the failed drive is just a ReadError, then we might need to progress
1337 * the repair/check process
1338 */
1339 if (failed == 1 && ! conf->mddev->ro &&
1340 test_bit(R5_ReadError, &sh->dev[failed_num].flags)
1341 && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
1342 && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
1343 ) {
1344 dev = &sh->dev[failed_num];
1345 if (!test_bit(R5_ReWrite, &dev->flags)) {
1346 set_bit(R5_Wantwrite, &dev->flags);
1347 set_bit(R5_ReWrite, &dev->flags);
1348 set_bit(R5_LOCKED, &dev->flags);
1349 } else {
1350 /* let's read it back */
1351 set_bit(R5_Wantread, &dev->flags);
1352 set_bit(R5_LOCKED, &dev->flags);
1353 }
1354 }
1355
1278 spin_unlock(&sh->lock); 1356 spin_unlock(&sh->lock);
1279 1357
1280 while ((bi=return_bi)) { 1358 while ((bi=return_bi)) {
@@ -1305,8 +1383,8 @@ static void handle_stripe(struct stripe_head *sh)
1305 bi->bi_end_io = raid5_end_read_request; 1383 bi->bi_end_io = raid5_end_read_request;
1306 1384
1307 rcu_read_lock(); 1385 rcu_read_lock();
1308 rdev = conf->disks[i].rdev; 1386 rdev = rcu_dereference(conf->disks[i].rdev);
1309 if (rdev && rdev->faulty) 1387 if (rdev && test_bit(Faulty, &rdev->flags))
1310 rdev = NULL; 1388 rdev = NULL;
1311 if (rdev) 1389 if (rdev)
1312 atomic_inc(&rdev->nr_pending); 1390 atomic_inc(&rdev->nr_pending);
@@ -1379,8 +1457,8 @@ static void unplug_slaves(mddev_t *mddev)
1379 1457
1380 rcu_read_lock(); 1458 rcu_read_lock();
1381 for (i=0; i<mddev->raid_disks; i++) { 1459 for (i=0; i<mddev->raid_disks; i++) {
1382 mdk_rdev_t *rdev = conf->disks[i].rdev; 1460 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1383 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 1461 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
1384 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 1462 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1385 1463
1386 atomic_inc(&rdev->nr_pending); 1464 atomic_inc(&rdev->nr_pending);
@@ -1424,8 +1502,8 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
1424 1502
1425 rcu_read_lock(); 1503 rcu_read_lock();
1426 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 1504 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1427 mdk_rdev_t *rdev = conf->disks[i].rdev; 1505 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1428 if (rdev && !rdev->faulty) { 1506 if (rdev && !test_bit(Faulty, &rdev->flags)) {
1429 struct block_device *bdev = rdev->bdev; 1507 struct block_device *bdev = rdev->bdev;
1430 request_queue_t *r_queue = bdev_get_queue(bdev); 1508 request_queue_t *r_queue = bdev_get_queue(bdev);
1431 1509
@@ -1567,6 +1645,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1567 return rv; 1645 return rv;
1568 } 1646 }
1569 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1647 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1648 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1570 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 1649 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1571 /* we can skip this block, and probably more */ 1650 /* we can skip this block, and probably more */
1572 sync_blocks /= STRIPE_SECTORS; 1651 sync_blocks /= STRIPE_SECTORS;
@@ -1663,6 +1742,74 @@ static void raid5d (mddev_t *mddev)
1663 PRINTK("--- raid5d inactive\n"); 1742 PRINTK("--- raid5d inactive\n");
1664} 1743}
1665 1744
1745static ssize_t
1746raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
1747{
1748 raid5_conf_t *conf = mddev_to_conf(mddev);
1749 if (conf)
1750 return sprintf(page, "%d\n", conf->max_nr_stripes);
1751 else
1752 return 0;
1753}
1754
1755static ssize_t
1756raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
1757{
1758 raid5_conf_t *conf = mddev_to_conf(mddev);
1759 char *end;
1760 int new;
1761 if (len >= PAGE_SIZE)
1762 return -EINVAL;
1763 if (!conf)
1764 return -ENODEV;
1765
1766 new = simple_strtoul(page, &end, 10);
1767 if (!*page || (*end && *end != '\n') )
1768 return -EINVAL;
1769 if (new <= 16 || new > 32768)
1770 return -EINVAL;
1771 while (new < conf->max_nr_stripes) {
1772 if (drop_one_stripe(conf))
1773 conf->max_nr_stripes--;
1774 else
1775 break;
1776 }
1777 while (new > conf->max_nr_stripes) {
1778 if (grow_one_stripe(conf))
1779 conf->max_nr_stripes++;
1780 else break;
1781 }
1782 return len;
1783}
1784
1785static struct md_sysfs_entry
1786raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
1787 raid5_show_stripe_cache_size,
1788 raid5_store_stripe_cache_size);
1789
1790static ssize_t
1791stripe_cache_active_show(mddev_t *mddev, char *page)
1792{
1793 raid5_conf_t *conf = mddev_to_conf(mddev);
1794 if (conf)
1795 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
1796 else
1797 return 0;
1798}
1799
1800static struct md_sysfs_entry
1801raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
1802
1803static struct attribute *raid5_attrs[] = {
1804 &raid5_stripecache_size.attr,
1805 &raid5_stripecache_active.attr,
1806 NULL,
1807};
1808static struct attribute_group raid5_attrs_group = {
1809 .name = NULL,
1810 .attrs = raid5_attrs,
1811};
1812
1666static int run(mddev_t *mddev) 1813static int run(mddev_t *mddev)
1667{ 1814{
1668 raid5_conf_t *conf; 1815 raid5_conf_t *conf;
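
The new stripe_cache_size attribute resizes the cache one stripe at a time, so conf->max_nr_stripes never runs ahead of what grow_one_stripe()/drop_one_stripe() actually allocated or freed. The loop structure, restated as a standalone helper with function pointers in place of the real conf operations:

    static void demo_resize_cache(int *cur, int target,
                                  int (*grow_one)(void), int (*drop_one)(void))
    {
            /* shrink until the target is reached or a stripe is busy */
            while (target < *cur && drop_one())
                    (*cur)--;
            /* grow until the target is reached or allocation fails */
            while (target > *cur && grow_one())
                    (*cur)++;
    }

The store routine rejects values outside 17..32768 with -EINVAL; given where md places its sysfs attributes, the knob should end up under /sys/block/mdX/md/stripe_cache_size.
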
@@ -1709,7 +1856,7 @@ static int run(mddev_t *mddev)
1709 1856
1710 disk->rdev = rdev; 1857 disk->rdev = rdev;
1711 1858
1712 if (rdev->in_sync) { 1859 if (test_bit(In_sync, &rdev->flags)) {
1713 char b[BDEVNAME_SIZE]; 1860 char b[BDEVNAME_SIZE];
1714 printk(KERN_INFO "raid5: device %s operational as raid" 1861 printk(KERN_INFO "raid5: device %s operational as raid"
1715 " disk %d\n", bdevname(rdev->bdev,b), 1862 " disk %d\n", bdevname(rdev->bdev,b),
@@ -1804,6 +1951,7 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1804 } 1951 }
1805 1952
1806 /* Ok, everything is just fine now */ 1953 /* Ok, everything is just fine now */
1954 sysfs_create_group(&mddev->kobj, &raid5_attrs_group);
1807 1955
1808 if (mddev->bitmap) 1956 if (mddev->bitmap)
1809 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; 1957 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
@@ -1828,7 +1976,7 @@ abort:
1828 1976
1829 1977
1830 1978
1831static int stop (mddev_t *mddev) 1979static int stop(mddev_t *mddev)
1832{ 1980{
1833 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1981 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1834 1982
@@ -1837,6 +1985,7 @@ static int stop (mddev_t *mddev)
1837 shrink_stripes(conf); 1985 shrink_stripes(conf);
1838 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); 1986 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1839 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 1987 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1988 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
1840 kfree(conf); 1989 kfree(conf);
1841 mddev->private = NULL; 1990 mddev->private = NULL;
1842 return 0; 1991 return 0;
@@ -1887,7 +2036,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
1887 for (i = 0; i < conf->raid_disks; i++) 2036 for (i = 0; i < conf->raid_disks; i++)
1888 seq_printf (seq, "%s", 2037 seq_printf (seq, "%s",
1889 conf->disks[i].rdev && 2038 conf->disks[i].rdev &&
1890 conf->disks[i].rdev->in_sync ? "U" : "_"); 2039 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
1891 seq_printf (seq, "]"); 2040 seq_printf (seq, "]");
1892#if RAID5_DEBUG 2041#if RAID5_DEBUG
1893#define D(x) \ 2042#define D(x) \
@@ -1914,7 +2063,7 @@ static void print_raid5_conf (raid5_conf_t *conf)
1914 tmp = conf->disks + i; 2063 tmp = conf->disks + i;
1915 if (tmp->rdev) 2064 if (tmp->rdev)
1916 printk(" disk %d, o:%d, dev:%s\n", 2065 printk(" disk %d, o:%d, dev:%s\n",
1917 i, !tmp->rdev->faulty, 2066 i, !test_bit(Faulty, &tmp->rdev->flags),
1918 bdevname(tmp->rdev->bdev,b)); 2067 bdevname(tmp->rdev->bdev,b));
1919 } 2068 }
1920} 2069}
@@ -1928,12 +2077,12 @@ static int raid5_spare_active(mddev_t *mddev)
1928 for (i = 0; i < conf->raid_disks; i++) { 2077 for (i = 0; i < conf->raid_disks; i++) {
1929 tmp = conf->disks + i; 2078 tmp = conf->disks + i;
1930 if (tmp->rdev 2079 if (tmp->rdev
1931 && !tmp->rdev->faulty 2080 && !test_bit(Faulty, &tmp->rdev->flags)
1932 && !tmp->rdev->in_sync) { 2081 && !test_bit(In_sync, &tmp->rdev->flags)) {
1933 mddev->degraded--; 2082 mddev->degraded--;
1934 conf->failed_disks--; 2083 conf->failed_disks--;
1935 conf->working_disks++; 2084 conf->working_disks++;
1936 tmp->rdev->in_sync = 1; 2085 set_bit(In_sync, &tmp->rdev->flags);
1937 } 2086 }
1938 } 2087 }
1939 print_raid5_conf(conf); 2088 print_raid5_conf(conf);
@@ -1950,7 +2099,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
1950 print_raid5_conf(conf); 2099 print_raid5_conf(conf);
1951 rdev = p->rdev; 2100 rdev = p->rdev;
1952 if (rdev) { 2101 if (rdev) {
1953 if (rdev->in_sync || 2102 if (test_bit(In_sync, &rdev->flags) ||
1954 atomic_read(&rdev->nr_pending)) { 2103 atomic_read(&rdev->nr_pending)) {
1955 err = -EBUSY; 2104 err = -EBUSY;
1956 goto abort; 2105 goto abort;
@@ -1985,12 +2134,12 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1985 */ 2134 */
1986 for (disk=0; disk < mddev->raid_disks; disk++) 2135 for (disk=0; disk < mddev->raid_disks; disk++)
1987 if ((p=conf->disks + disk)->rdev == NULL) { 2136 if ((p=conf->disks + disk)->rdev == NULL) {
1988 rdev->in_sync = 0; 2137 clear_bit(In_sync, &rdev->flags);
1989 rdev->raid_disk = disk; 2138 rdev->raid_disk = disk;
1990 found = 1; 2139 found = 1;
1991 if (rdev->saved_raid_disk != disk) 2140 if (rdev->saved_raid_disk != disk)
1992 conf->fullsync = 1; 2141 conf->fullsync = 1;
1993 p->rdev = rdev; 2142 rcu_assign_pointer(p->rdev, rdev);
1994 break; 2143 break;
1995 } 2144 }
1996 print_raid5_conf(conf); 2145 print_raid5_conf(conf);
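
The read-error handling added to raid5 above only fails a device after a reconstruct-and-rewrite attempt has also failed; whether a failed read is retried at all comes down to three tests in raid5_end_read_request(). Restated as a standalone predicate (parameter names are descriptive, not the driver's):

    static int should_retry_read(int array_degraded, int rewrite_tried,
                                 int read_errors, int max_nr_stripes)
    {
            if (array_degraded)             /* no redundancy left to rebuild from */
                    return 0;
            if (rewrite_tried)              /* the rewritten block failed again */
                    return 0;
            if (read_errors > max_nr_stripes)   /* device looks unhealthy */
                    return 0;
            return 1;       /* reconstruct from parity and rewrite the block */
    }

On a later successful read the per-device read_errors counter is reset to zero, and handle_stripe() drives the actual repair by setting R5_Wantwrite/R5_ReWrite, then re-reading the block to verify it.
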
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 775786947701..eae5a35629c5 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -507,19 +507,19 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
507 raid6_conf_t *conf = (raid6_conf_t *) mddev->private; 507 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
508 PRINTK("raid6: error called\n"); 508 PRINTK("raid6: error called\n");
509 509
510 if (!rdev->faulty) { 510 if (!test_bit(Faulty, &rdev->flags)) {
511 mddev->sb_dirty = 1; 511 mddev->sb_dirty = 1;
512 if (rdev->in_sync) { 512 if (test_bit(In_sync, &rdev->flags)) {
513 conf->working_disks--; 513 conf->working_disks--;
514 mddev->degraded++; 514 mddev->degraded++;
515 conf->failed_disks++; 515 conf->failed_disks++;
516 rdev->in_sync = 0; 516 clear_bit(In_sync, &rdev->flags);
517 /* 517 /*
518 * if recovery was running, make sure it aborts. 518 * if recovery was running, make sure it aborts.
519 */ 519 */
520 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 520 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
521 } 521 }
522 rdev->faulty = 1; 522 set_bit(Faulty, &rdev->flags);
523 printk (KERN_ALERT 523 printk (KERN_ALERT
524 "raid6: Disk failure on %s, disabling device." 524 "raid6: Disk failure on %s, disabling device."
525 " Operation continuing on %d devices\n", 525 " Operation continuing on %d devices\n",
@@ -1071,7 +1071,7 @@ static void handle_stripe(struct stripe_head *sh)
1071 } 1071 }
1072 if (dev->written) written++; 1072 if (dev->written) written++;
1073 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ 1073 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
1074 if (!rdev || !rdev->in_sync) { 1074 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1075 if ( failed < 2 ) 1075 if ( failed < 2 )
1076 failed_num[failed] = i; 1076 failed_num[failed] = i;
1077 failed++; 1077 failed++;
@@ -1464,8 +1464,8 @@ static void handle_stripe(struct stripe_head *sh)
1464 bi->bi_end_io = raid6_end_read_request; 1464 bi->bi_end_io = raid6_end_read_request;
1465 1465
1466 rcu_read_lock(); 1466 rcu_read_lock();
1467 rdev = conf->disks[i].rdev; 1467 rdev = rcu_dereference(conf->disks[i].rdev);
1468 if (rdev && rdev->faulty) 1468 if (rdev && test_bit(Faulty, &rdev->flags))
1469 rdev = NULL; 1469 rdev = NULL;
1470 if (rdev) 1470 if (rdev)
1471 atomic_inc(&rdev->nr_pending); 1471 atomic_inc(&rdev->nr_pending);
@@ -1538,8 +1538,8 @@ static void unplug_slaves(mddev_t *mddev)
1538 1538
1539 rcu_read_lock(); 1539 rcu_read_lock();
1540 for (i=0; i<mddev->raid_disks; i++) { 1540 for (i=0; i<mddev->raid_disks; i++) {
1541 mdk_rdev_t *rdev = conf->disks[i].rdev; 1541 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1542 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 1542 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
1543 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 1543 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1544 1544
1545 atomic_inc(&rdev->nr_pending); 1545 atomic_inc(&rdev->nr_pending);
@@ -1583,8 +1583,8 @@ static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
1583 1583
1584 rcu_read_lock(); 1584 rcu_read_lock();
1585 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 1585 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1586 mdk_rdev_t *rdev = conf->disks[i].rdev; 1586 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
1587 if (rdev && !rdev->faulty) { 1587 if (rdev && !test_bit(Faulty, &rdev->flags)) {
1588 struct block_device *bdev = rdev->bdev; 1588 struct block_device *bdev = rdev->bdev;
1589 request_queue_t *r_queue = bdev_get_queue(bdev); 1589 request_queue_t *r_queue = bdev_get_queue(bdev);
1590 1590
@@ -1868,7 +1868,7 @@ static int run(mddev_t *mddev)
1868 1868
1869 disk->rdev = rdev; 1869 disk->rdev = rdev;
1870 1870
1871 if (rdev->in_sync) { 1871 if (test_bit(In_sync, &rdev->flags)) {
1872 char b[BDEVNAME_SIZE]; 1872 char b[BDEVNAME_SIZE];
1873 printk(KERN_INFO "raid6: device %s operational as raid" 1873 printk(KERN_INFO "raid6: device %s operational as raid"
1874 " disk %d\n", bdevname(rdev->bdev,b), 1874 " disk %d\n", bdevname(rdev->bdev,b),
@@ -2052,7 +2052,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
2052 for (i = 0; i < conf->raid_disks; i++) 2052 for (i = 0; i < conf->raid_disks; i++)
2053 seq_printf (seq, "%s", 2053 seq_printf (seq, "%s",
2054 conf->disks[i].rdev && 2054 conf->disks[i].rdev &&
2055 conf->disks[i].rdev->in_sync ? "U" : "_"); 2055 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
2056 seq_printf (seq, "]"); 2056 seq_printf (seq, "]");
2057#if RAID6_DUMPSTATE 2057#if RAID6_DUMPSTATE
2058 seq_printf (seq, "\n"); 2058 seq_printf (seq, "\n");
@@ -2078,7 +2078,7 @@ static void print_raid6_conf (raid6_conf_t *conf)
2078 tmp = conf->disks + i; 2078 tmp = conf->disks + i;
2079 if (tmp->rdev) 2079 if (tmp->rdev)
2080 printk(" disk %d, o:%d, dev:%s\n", 2080 printk(" disk %d, o:%d, dev:%s\n",
2081 i, !tmp->rdev->faulty, 2081 i, !test_bit(Faulty, &tmp->rdev->flags),
2082 bdevname(tmp->rdev->bdev,b)); 2082 bdevname(tmp->rdev->bdev,b));
2083 } 2083 }
2084} 2084}
@@ -2092,12 +2092,12 @@ static int raid6_spare_active(mddev_t *mddev)
2092 for (i = 0; i < conf->raid_disks; i++) { 2092 for (i = 0; i < conf->raid_disks; i++) {
2093 tmp = conf->disks + i; 2093 tmp = conf->disks + i;
2094 if (tmp->rdev 2094 if (tmp->rdev
2095 && !tmp->rdev->faulty 2095 && !test_bit(Faulty, &tmp->rdev->flags)
2096 && !tmp->rdev->in_sync) { 2096 && !test_bit(In_sync, &tmp->rdev->flags)) {
2097 mddev->degraded--; 2097 mddev->degraded--;
2098 conf->failed_disks--; 2098 conf->failed_disks--;
2099 conf->working_disks++; 2099 conf->working_disks++;
2100 tmp->rdev->in_sync = 1; 2100 set_bit(In_sync, &tmp->rdev->flags);
2101 } 2101 }
2102 } 2102 }
2103 print_raid6_conf(conf); 2103 print_raid6_conf(conf);
@@ -2114,7 +2114,7 @@ static int raid6_remove_disk(mddev_t *mddev, int number)
2114 print_raid6_conf(conf); 2114 print_raid6_conf(conf);
2115 rdev = p->rdev; 2115 rdev = p->rdev;
2116 if (rdev) { 2116 if (rdev) {
2117 if (rdev->in_sync || 2117 if (test_bit(In_sync, &rdev->flags) ||
2118 atomic_read(&rdev->nr_pending)) { 2118 atomic_read(&rdev->nr_pending)) {
2119 err = -EBUSY; 2119 err = -EBUSY;
2120 goto abort; 2120 goto abort;
@@ -2149,12 +2149,12 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2149 */ 2149 */
2150 for (disk=0; disk < mddev->raid_disks; disk++) 2150 for (disk=0; disk < mddev->raid_disks; disk++)
2151 if ((p=conf->disks + disk)->rdev == NULL) { 2151 if ((p=conf->disks + disk)->rdev == NULL) {
2152 rdev->in_sync = 0; 2152 clear_bit(In_sync, &rdev->flags);
2153 rdev->raid_disk = disk; 2153 rdev->raid_disk = disk;
2154 found = 1; 2154 found = 1;
2155 if (rdev->saved_raid_disk != disk) 2155 if (rdev->saved_raid_disk != disk)
2156 conf->fullsync = 1; 2156 conf->fullsync = 1;
2157 p->rdev = rdev; 2157 rcu_assign_pointer(p->rdev, rdev);
2158 break; 2158 break;
2159 } 2159 }
2160 print_raid6_conf(conf); 2160 print_raid6_conf(conf);
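
Finally, a sketch of the publish side: the *_add_disk hunks replace the bare assignment p->rdev = rdev with rcu_assign_pointer(p->rdev, rdev), which orders the initialisation of the rdev before the pointer becomes visible to the rcu_dereference() readers shown earlier. A C11 release store stands in for the kernel primitive here; the names are illustrative, not the kernel's.

/* demo_publish.c - illustrative only */
#include <stdatomic.h>
#include <stdio.h>

struct demo_rdev {
	int raid_disk;
};

static _Atomic(struct demo_rdev *) slot; /* plays the role of p->rdev */

static void demo_add_disk(struct demo_rdev *rdev, int disk)
{
	rdev->raid_disk = disk;           /* initialise first ...        */
	/* ... then publish, as rcu_assign_pointer() does: the release
	 * store guarantees a reader that sees the pointer also sees the
	 * raid_disk value written above. */
	atomic_store_explicit(&slot, rdev, memory_order_release);
}

int main(void)
{
	static struct demo_rdev r;

	demo_add_disk(&r, 3);

	struct demo_rdev *seen =
		atomic_load_explicit(&slot, memory_order_acquire);
	if (seen)
		printf("raid_disk=%d\n", seen->raid_disk);
	return 0;
}

Without the ordered store, a concurrent reader could observe the new pointer but a stale raid_disk, exactly the window the rcu_assign_pointer()/rcu_dereference() pairing in this patch closes.
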