Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--	drivers/md/md.c	636
1 file changed, 520 insertions(+), 116 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9ecf51ee596f..adf960d8a7c9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -131,6 +131,8 @@ static ctl_table raid_root_table[] = {
 
 static struct block_device_operations md_fops;
 
+static int start_readonly;
+
 /*
  * Enables to iterate over all existing md arrays
  * all_mddevs_lock protects this list.
@@ -181,7 +183,7 @@ static void mddev_put(mddev_t *mddev)
 	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
 		list_del(&mddev->all_mddevs);
 		blk_put_queue(mddev->queue);
-		kfree(mddev);
+		kobject_unregister(&mddev->kobj);
 	}
 	spin_unlock(&all_mddevs_lock);
 }
@@ -330,18 +332,46 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 static int super_written(struct bio *bio, unsigned int bytes_done, int error)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
+	mddev_t *mddev = rdev->mddev;
 	if (bio->bi_size)
 		return 1;
 
 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
-		md_error(rdev->mddev, rdev);
+		md_error(mddev, rdev);
 
-	if (atomic_dec_and_test(&rdev->mddev->pending_writes))
-		wake_up(&rdev->mddev->sb_wait);
+	if (atomic_dec_and_test(&mddev->pending_writes))
+		wake_up(&mddev->sb_wait);
 	bio_put(bio);
 	return 0;
 }
 
+static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
+{
+	struct bio *bio2 = bio->bi_private;
+	mdk_rdev_t *rdev = bio2->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (bio->bi_size)
+		return 1;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+	    error == -EOPNOTSUPP) {
+		unsigned long flags;
+		/* barriers don't appear to be supported :-( */
+		set_bit(BarriersNotsupp, &rdev->flags);
+		mddev->barriers_work = 0;
+		spin_lock_irqsave(&mddev->write_lock, flags);
+		bio2->bi_next = mddev->biolist;
+		mddev->biolist = bio2;
+		spin_unlock_irqrestore(&mddev->write_lock, flags);
+		wake_up(&mddev->sb_wait);
+		bio_put(bio);
+		return 0;
+	}
+	bio_put(bio2);
+	bio->bi_private = rdev;
+	return super_written(bio, bytes_done, error);
+}
+
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		    sector_t sector, int size, struct page *page)
 {
@@ -350,16 +380,54 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
+	 *
+	 * As we might need to resubmit the request if BIO_RW_BARRIER
+	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
+	bio->bi_rw = rw;
+
 	atomic_inc(&mddev->pending_writes);
-	submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
+		struct bio *rbio;
+		rw |= (1<<BIO_RW_BARRIER);
+		rbio = bio_clone(bio, GFP_NOIO);
+		rbio->bi_private = bio;
+		rbio->bi_end_io = super_written_barrier;
+		submit_bio(rw, rbio);
+	} else
+		submit_bio(rw, bio);
+}
+
+void md_super_wait(mddev_t *mddev)
+{
+	/* wait for all superblock writes that were scheduled to complete.
+	 * if any had to be retried (due to BARRIER problems), retry them
+	 */
+	DEFINE_WAIT(wq);
+	for(;;) {
+		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&mddev->pending_writes)==0)
+			break;
+		while (mddev->biolist) {
+			struct bio *bio;
+			spin_lock_irq(&mddev->write_lock);
+			bio = mddev->biolist;
+			mddev->biolist = bio->bi_next;
+			bio->bi_next = NULL;
+			spin_unlock_irq(&mddev->write_lock);
+			submit_bio(bio->bi_rw, bio);
+		}
+		schedule();
+	}
+	finish_wait(&mddev->sb_wait, &wq);
 }
 
 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
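The barrier handling above is a two-stage fallback: md_super_write clones every superblock bio and submits the clone with BIO_RW_BARRIER set; when a device rejects barriers with -EOPNOTSUPP, super_written_barrier records that (BarriersNotsupp, barriers_work = 0) and parks the original, barrier-free bio on mddev->biolist, which md_super_wait drains by resubmitting before it sleeps. A minimal userspace sketch of the same retry-on-unsupported pattern (the request structure and submit() are invented for illustration):

	#include <errno.h>
	#include <stdio.h>

	struct request { int barrier; int done; };

	/* Stand-in for submit_bio(): pretend the device cannot do barriers,
	 * like a driver completing the bio with -EOPNOTSUPP. */
	static int submit(struct request *req)
	{
		if (req->barrier)
			return -EOPNOTSUPP;
		req->done = 1;
		return 0;
	}

	int main(void)
	{
		struct request req = { .barrier = 1, .done = 0 };

		if (submit(&req) == -EOPNOTSUPP) {
			/* mirrors md_super_wait() resubmitting the parked,
			 * barrier-free bio from mddev->biolist */
			req.barrier = 0;
			submit(&req);
		}
		printf("superblock write done: %d\n", req.done);
		return 0;
	}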
@@ -610,7 +678,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 
 	rdev->raid_disk = -1;
-	rdev->in_sync = 0;
+	rdev->flags = 0;
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
 		mddev->minor_version = sb->minor_version;
@@ -671,21 +739,19 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		return 0;
 
 	if (mddev->level != LEVEL_MULTIPATH) {
-		rdev->faulty = 0;
-		rdev->flags = 0;
 		desc = sb->disks + rdev->desc_nr;
 
 		if (desc->state & (1<<MD_DISK_FAULTY))
-			rdev->faulty = 1;
+			set_bit(Faulty, &rdev->flags);
 		else if (desc->state & (1<<MD_DISK_SYNC) &&
 			 desc->raid_disk < mddev->raid_disks) {
-			rdev->in_sync = 1;
+			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
 		}
 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
 	} else /* MULTIPATH are always insync */
-		rdev->in_sync = 1;
+		set_bit(In_sync, &rdev->flags);
 	return 0;
 }
 
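From here on, the patch replaces the separate rdev->faulty and rdev->in_sync fields with single bits in rdev->flags, tested and updated through the atomic test_bit/set_bit/clear_bit helpers so concurrent readers never see a half-updated pair of fields. The bit definitions live in md_k.h, which is not part of this diff; the sketch below shows the assumed declarations and the conversion idiom:

	/* Assumed declarations (md_k.h, not in this diff): these are bit
	 * numbers for the atomic bitops, not masks. */
	#define Faulty		1	/* device is known to have a fault */
	#define In_sync		2	/* device is in sync with the array */
	#define WriteMostly	4	/* only read from it when unavoidable */
	#define BarriersNotsupp	5	/* BIO_RW_BARRIER is not supported */

	/* Conversion idiom used throughout this patch:
	 *	rdev->faulty = 1;    ->  set_bit(Faulty, &rdev->flags);
	 *	if (rdev->in_sync)   ->  if (test_bit(In_sync, &rdev->flags))
	 *	rdev->in_sync = 0;   ->  clear_bit(In_sync, &rdev->flags);
	 */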
@@ -699,6 +765,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	mdk_rdev_t *rdev2;
 	int next_spare = mddev->raid_disks;
 
+
 	/* make rdev->sb match mddev data..
 	 *
 	 * 1/ zero out disks
@@ -758,23 +825,27 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
 	ITERATE_RDEV(mddev,rdev2,tmp) {
 		mdp_disk_t *d;
-		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
-			rdev2->desc_nr = rdev2->raid_disk;
+		int desc_nr;
+		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+		    && !test_bit(Faulty, &rdev2->flags))
+			desc_nr = rdev2->raid_disk;
 		else
-			rdev2->desc_nr = next_spare++;
+			desc_nr = next_spare++;
+		rdev2->desc_nr = desc_nr;
 		d = &sb->disks[rdev2->desc_nr];
 		nr_disks++;
 		d->number = rdev2->desc_nr;
 		d->major = MAJOR(rdev2->bdev->bd_dev);
 		d->minor = MINOR(rdev2->bdev->bd_dev);
-		if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty)
+		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+		    && !test_bit(Faulty, &rdev2->flags))
 			d->raid_disk = rdev2->raid_disk;
 		else
 			d->raid_disk = rdev2->desc_nr; /* compatibility */
-		if (rdev2->faulty) {
+		if (test_bit(Faulty, &rdev2->flags)) {
 			d->state = (1<<MD_DISK_FAULTY);
 			failed++;
-		} else if (rdev2->in_sync) {
+		} else if (test_bit(In_sync, &rdev2->flags)) {
 			d->state = (1<<MD_DISK_ACTIVE);
 			d->state |= (1<<MD_DISK_SYNC);
 			active++;
@@ -787,7 +858,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		if (test_bit(WriteMostly, &rdev2->flags))
 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
 	}
-
 	/* now set the "removed" and "faulty" bits on any missing devices */
 	for (i=0 ; i < mddev->raid_disks ; i++) {
 		mdp_disk_t *d = &sb->disks[i];
@@ -944,7 +1014,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
 
 	rdev->raid_disk = -1;
-	rdev->in_sync = 0;
+	rdev->flags = 0;
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
 		mddev->patch_version = 0;
@@ -996,22 +1066,19 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
 		switch(role) {
 		case 0xffff: /* spare */
-			rdev->faulty = 0;
 			break;
 		case 0xfffe: /* faulty */
-			rdev->faulty = 1;
+			set_bit(Faulty, &rdev->flags);
 			break;
 		default:
-			rdev->in_sync = 1;
-			rdev->faulty = 0;
+			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = role;
 			break;
 		}
-		rdev->flags = 0;
 		if (sb->devflags & WriteMostly1)
 			set_bit(WriteMostly, &rdev->flags);
 	} else /* MULTIPATH are always insync */
-		rdev->in_sync = 1;
+		set_bit(In_sync, &rdev->flags);
 
 	return 0;
 }
@@ -1055,9 +1122,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	ITERATE_RDEV(mddev,rdev2,tmp) {
 		i = rdev2->desc_nr;
-		if (rdev2->faulty)
+		if (test_bit(Faulty, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
-		else if (rdev2->in_sync)
+		else if (test_bit(In_sync, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
 		else
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
@@ -1115,6 +1182,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	mdk_rdev_t *same_pdev;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+	struct kobject *ko;
 
 	if (rdev->mddev) {
 		MD_BUG();
@@ -1143,10 +1211,22 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		if (find_rdev_nr(mddev, rdev->desc_nr))
 			return -EBUSY;
 	}
+	bdevname(rdev->bdev,b);
+	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
+		return -ENOMEM;
 
 	list_add(&rdev->same_set, &mddev->disks);
 	rdev->mddev = mddev;
-	printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
+	printk(KERN_INFO "md: bind<%s>\n", b);
+
+	rdev->kobj.parent = &mddev->kobj;
+	kobject_add(&rdev->kobj);
+
+	if (rdev->bdev->bd_part)
+		ko = &rdev->bdev->bd_part->kobj;
+	else
+		ko = &rdev->bdev->bd_disk->kobj;
+	sysfs_create_link(&rdev->kobj, ko, "block");
 	return 0;
 }
 
@@ -1160,6 +1240,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 	list_del_init(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
+	sysfs_remove_link(&rdev->kobj, "block");
+	kobject_del(&rdev->kobj);
 }
 
 /*
@@ -1215,7 +1297,7 @@ static void export_rdev(mdk_rdev_t * rdev)
 	md_autodetect_dev(rdev->bdev->bd_dev);
 #endif
 	unlock_rdev(rdev);
-	kfree(rdev);
+	kobject_put(&rdev->kobj);
 }
 
 static void kick_rdev_from_array(mdk_rdev_t * rdev)
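The direct kfree() calls disappear because rdevs (and, above, mddevs) now embed a struct kobject: unbind_rdev_from_array() only removes the sysfs entry with kobject_del(), while the kobject_put() here drops a reference, and the actual kfree() happens in the kobj_type's release method (rdev_free(), added later in this patch) once the count reaches zero. The embed-and-release pattern, sketched with illustrative names:

	struct my_dev {
		struct kobject kobj;	/* embedded; container_of() recovers my_dev */
		int payload;
	};

	static void my_dev_release(struct kobject *ko)
	{
		/* runs only when the refcount hits zero - the single
		 * place where freeing the containing object is safe */
		kfree(container_of(ko, struct my_dev, kobj));
	}

	static struct kobj_type my_dev_ktype = {
		.release = my_dev_release,
	};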
@@ -1287,7 +1369,8 @@ static void print_rdev(mdk_rdev_t *rdev)
 	char b[BDEVNAME_SIZE];
 	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
 		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
-		rdev->faulty, rdev->in_sync, rdev->desc_nr);
+		test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
+		rdev->desc_nr);
 	if (rdev->sb_loaded) {
 		printk(KERN_INFO "md: rdev superblock:\n");
 		print_sb((mdp_super_t*)page_address(rdev->sb_page));
@@ -1344,7 +1427,7 @@ static void md_update_sb(mddev_t * mddev)
 	int sync_req;
 
 repeat:
-	spin_lock(&mddev->write_lock);
+	spin_lock_irq(&mddev->write_lock);
 	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
 	mddev->events ++;
@@ -1367,11 +1450,11 @@ repeat:
 	 */
 	if (!mddev->persistent) {
 		mddev->sb_dirty = 0;
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 		wake_up(&mddev->sb_wait);
 		return;
 	}
-	spin_unlock(&mddev->write_lock);
+	spin_unlock_irq(&mddev->write_lock);
 
 	dprintk(KERN_INFO
 		"md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1381,11 +1464,11 @@ repeat:
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		char b[BDEVNAME_SIZE];
 		dprintk(KERN_INFO "md: ");
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			dprintk("(skipping faulty ");
 
 		dprintk("%s ", bdevname(rdev->bdev,b));
-		if (!rdev->faulty) {
+		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
 				       rdev->sb_offset<<1, rdev->sb_size,
 				       rdev->sb_page);
@@ -1399,21 +1482,106 @@ repeat:
 			/* only need to write one superblock... */
 			break;
 	}
-	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	md_super_wait(mddev);
 	/* if there was a failure, sb_dirty was set to 1, and we re-write super */
 
-	spin_lock(&mddev->write_lock);
+	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
 		/* have to write it out again */
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 		goto repeat;
 	}
 	mddev->sb_dirty = 0;
-	spin_unlock(&mddev->write_lock);
+	spin_unlock_irq(&mddev->write_lock);
 	wake_up(&mddev->sb_wait);
 
 }
 
+struct rdev_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(mdk_rdev_t *, char *);
+	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
+};
+
+static ssize_t
+state_show(mdk_rdev_t *rdev, char *page)
+{
+	char *sep = "";
+	int len=0;
+
+	if (test_bit(Faulty, &rdev->flags)) {
+		len+= sprintf(page+len, "%sfaulty",sep);
+		sep = ",";
+	}
+	if (test_bit(In_sync, &rdev->flags)) {
+		len += sprintf(page+len, "%sin_sync",sep);
+		sep = ",";
+	}
+	if (!test_bit(Faulty, &rdev->flags) &&
+	    !test_bit(In_sync, &rdev->flags)) {
+		len += sprintf(page+len, "%sspare", sep);
+		sep = ",";
+	}
+	return len+sprintf(page+len, "\n");
+}
+
+static struct rdev_sysfs_entry
+rdev_state = __ATTR_RO(state);
+
+static ssize_t
+super_show(mdk_rdev_t *rdev, char *page)
+{
+	if (rdev->sb_loaded && rdev->sb_size) {
+		memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
+		return rdev->sb_size;
+	} else
+		return 0;
+}
+static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
+
+static struct attribute *rdev_default_attrs[] = {
+	&rdev_state.attr,
+	&rdev_super.attr,
+	NULL,
+};
+static ssize_t
+rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+	if (!entry->show)
+		return -EIO;
+	return entry->show(rdev, page);
+}
+
+static ssize_t
+rdev_attr_store(struct kobject *kobj, struct attribute *attr,
+	      const char *page, size_t length)
+{
+	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+	if (!entry->store)
+		return -EIO;
+	return entry->store(rdev, page, length);
+}
+
+static void rdev_free(struct kobject *ko)
+{
+	mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
+	kfree(rdev);
+}
+static struct sysfs_ops rdev_sysfs_ops = {
+	.show		= rdev_attr_show,
+	.store		= rdev_attr_store,
+};
+static struct kobj_type rdev_ktype = {
+	.release	= rdev_free,
+	.sysfs_ops	= &rdev_sysfs_ops,
+	.default_attrs	= rdev_default_attrs,
+};
+
 /*
  * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
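With the ktype above, every bound device exports a dev-<name> directory containing a textual 'state' attribute, the raw 'super' block image, and a 'block' symlink to the underlying block device. A userspace sketch reading the state of a hypothetical member sda1 of md0 (the paths depend on the local array):

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical array and member - adjust to the local setup */
		FILE *f = fopen("/sys/block/md0/md/dev-sda1/state", "r");
		char line[64];

		if (f) {
			if (fgets(line, sizeof(line), f))
				fputs(line, stdout);	/* e.g. "in_sync" or "spare" */
			fclose(f);
		}
		return 0;
	}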
@@ -1445,11 +1613,15 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	if (err)
 		goto abort_free;
 
+	rdev->kobj.parent = NULL;
+	rdev->kobj.ktype = &rdev_ktype;
+	kobject_init(&rdev->kobj);
+
 	rdev->desc_nr = -1;
-	rdev->faulty = 0;
-	rdev->in_sync = 0;
+	rdev->flags = 0;
 	rdev->data_offset = 0;
 	atomic_set(&rdev->nr_pending, 0);
+	atomic_set(&rdev->read_errors, 0);
 
 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 	if (!size) {
@@ -1537,7 +1709,7 @@ static void analyze_sbs(mddev_t * mddev)
 		if (mddev->level == LEVEL_MULTIPATH) {
 			rdev->desc_nr = i++;
 			rdev->raid_disk = rdev->desc_nr;
-			rdev->in_sync = 1;
+			set_bit(In_sync, &rdev->flags);
 		}
 	}
 
@@ -1551,6 +1723,162 @@ static void analyze_sbs(mddev_t * mddev)
 
 }
 
+static ssize_t
+level_show(mddev_t *mddev, char *page)
+{
+	mdk_personality_t *p = mddev->pers;
+	if (p == NULL && mddev->raid_disks == 0)
+		return 0;
+	if (mddev->level >= 0)
+		return sprintf(page, "RAID-%d\n", mddev->level);
+	else
+		return sprintf(page, "%s\n", p->name);
+}
+
+static struct md_sysfs_entry md_level = __ATTR_RO(level);
+
+static ssize_t
+raid_disks_show(mddev_t *mddev, char *page)
+{
+	if (mddev->raid_disks == 0)
+		return 0;
+	return sprintf(page, "%d\n", mddev->raid_disks);
+}
+
+static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);
+
+static ssize_t
+action_show(mddev_t *mddev, char *page)
+{
+	char *type = "idle";
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+				type = "resync";
+			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+				type = "check";
+			else
+				type = "repair";
+		} else
+			type = "recover";
+	}
+	return sprintf(page, "%s\n", type);
+}
+
+static ssize_t
+action_store(mddev_t *mddev, const char *page, size_t len)
+{
+	if (!mddev->pers || !mddev->pers->sync_request)
+		return -EINVAL;
+
+	if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) {
+		if (mddev->sync_thread) {
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+			md_unregister_thread(mddev->sync_thread);
+			mddev->sync_thread = NULL;
+			mddev->recovery = 0;
+		}
+		return len;
+	}
+
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+		return -EBUSY;
+	if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 ||
+	    strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0)
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else {
+		if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0)
+			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0)
+			return -EINVAL;
+		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	}
+	md_wakeup_thread(mddev->thread);
+	return len;
+}
+
+static ssize_t
+mismatch_cnt_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long) mddev->resync_mismatches);
+}
+
+static struct md_sysfs_entry
+md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+
+static struct md_sysfs_entry
+md_mismatches = __ATTR_RO(mismatch_cnt);
+
+static struct attribute *md_default_attrs[] = {
+	&md_level.attr,
+	&md_raid_disks.attr,
+	NULL,
+};
+
+static struct attribute *md_redundancy_attrs[] = {
+	&md_scan_mode.attr,
+	&md_mismatches.attr,
+	NULL,
+};
+static struct attribute_group md_redundancy_group = {
+	.name = NULL,
+	.attrs = md_redundancy_attrs,
+};
+
+
+static ssize_t
+md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	ssize_t rv;
+
+	if (!entry->show)
+		return -EIO;
+	mddev_lock(mddev);
+	rv = entry->show(mddev, page);
+	mddev_unlock(mddev);
+	return rv;
+}
+
+static ssize_t
+md_attr_store(struct kobject *kobj, struct attribute *attr,
+	      const char *page, size_t length)
+{
+	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	ssize_t rv;
+
+	if (!entry->store)
+		return -EIO;
+	mddev_lock(mddev);
+	rv = entry->store(mddev, page, length);
+	mddev_unlock(mddev);
+	return rv;
+}
+
+static void md_free(struct kobject *ko)
+{
+	mddev_t *mddev = container_of(ko, mddev_t, kobj);
+	kfree(mddev);
+}
+
+static struct sysfs_ops md_sysfs_ops = {
+	.show	= md_attr_show,
+	.store	= md_attr_store,
+};
+static struct kobj_type md_ktype = {
+	.release	= md_free,
+	.sysfs_ops	= &md_sysfs_ops,
+	.default_attrs	= md_default_attrs,
+};
+
 int mdp_major = 0;
 
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
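These attributes give userspace direct control of the recovery machinery: writing resync/recover/check/repair to sync_action sets the corresponding MD_RECOVERY_* bits and wakes the md thread, "idle" interrupts a running pass, and mismatch_cnt reports resync_mismatches afterwards. A sketch triggering a read-only check on a hypothetical md0 and reading the result:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/block/md0/md/sync_action", "w");
		char line[32];

		if (f) {
			fputs("check\n", f);	/* or "repair", "resync", "idle" */
			fclose(f);
		}
		/* once the pass completes, the mismatch counter is meaningful */
		f = fopen("/sys/block/md0/md/mismatch_cnt", "r");
		if (f) {
			if (fgets(line, sizeof(line), f))
				printf("mismatches: %s", line);
			fclose(f);
		}
		return 0;
	}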
@@ -1592,6 +1920,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	add_disk(disk);
 	mddev->gendisk = disk;
 	up(&disks_sem);
+	mddev->kobj.parent = &disk->kobj;
+	mddev->kobj.k_name = NULL;
+	snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
+	mddev->kobj.ktype = &md_ktype;
+	kobject_register(&mddev->kobj);
 	return NULL;
 }
 
@@ -1663,7 +1996,7 @@ static int do_md_run(mddev_t * mddev)
 
 	/* devices must have minimum size of one chunk */
 	ITERATE_RDEV(mddev,rdev,tmp) {
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			continue;
 		if (rdev->size < chunk_size / 1024) {
 			printk(KERN_WARNING
@@ -1691,7 +2024,7 @@ static int do_md_run(mddev_t * mddev)
 	 * Also find largest hardsector size
 	 */
 	ITERATE_RDEV(mddev,rdev,tmp) {
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			continue;
 		sync_blockdev(rdev->bdev);
 		invalidate_bdev(rdev->bdev, 0);
@@ -1715,6 +2048,10 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+	mddev->barriers_work = 1;
+
+	if (start_readonly)
+		mddev->ro = 2; /* read-only, but switch on first write */
 
 	/* before we start the array running, initialise the bitmap */
 	err = bitmap_create(mddev);
@@ -1730,12 +2067,24 @@ static int do_md_run(mddev_t * mddev)
 		bitmap_destroy(mddev);
 		return err;
 	}
+	if (mddev->pers->sync_request)
+		sysfs_create_group(&mddev->kobj, &md_redundancy_group);
+	else if (mddev->ro == 2) /* auto-readonly not meaningful */
+		mddev->ro = 0;
+
 	atomic_set(&mddev->writes_pending,0);
 	mddev->safemode = 0;
 	mddev->safemode_timer.function = md_safemode_timeout;
 	mddev->safemode_timer.data = (unsigned long) mddev;
 	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
 	mddev->in_sync = 1;
+
+	ITERATE_RDEV(mddev,rdev,tmp)
+		if (rdev->raid_disk >= 0) {
+			char nm[20];
+			sprintf(nm, "rd%d", rdev->raid_disk);
+			sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+		}
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
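Together with the kobject_register() in md_probe() above, this completes the array's sysfs tree: each running array gets an md/ directory under its gendisk, each bound member a dev-<name> subdirectory, and each active slot an rdN symlink; sync_action and mismatch_cnt appear only when the personality provides sync_request. For a hypothetical two-disk md0 the layout is roughly:

	/sys/block/md0/md/level
	/sys/block/md0/md/raid_disks
	/sys/block/md0/md/sync_action      (personalities with sync_request only)
	/sys/block/md0/md/mismatch_cnt
	/sys/block/md0/md/dev-sda1/{state,super,block}
	/sys/block/md0/md/dev-sdb1/{state,super,block}
	/sys/block/md0/md/rd0 -> dev-sda1
	/sys/block/md0/md/rd1 -> dev-sdb1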
@@ -1821,16 +2170,19 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
 	if (ro) {
 		err = -ENXIO;
-		if (mddev->ro)
+		if (mddev->ro==1)
 			goto out;
 		mddev->ro = 1;
 	} else {
 		bitmap_flush(mddev);
-		wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+		md_super_wait(mddev);
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
 		blk_queue_make_request(mddev->queue, md_fail_request);
 		mddev->pers->stop(mddev);
+		if (mddev->pers->sync_request)
+			sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+
 		module_put(mddev->pers->owner);
 		mddev->pers = NULL;
 		if (mddev->ro)
@@ -1857,9 +2209,18 @@ static int do_md_stop(mddev_t * mddev, int ro)
 	 * Free resources if final stop
 	 */
 	if (!ro) {
+		mdk_rdev_t *rdev;
+		struct list_head *tmp;
 		struct gendisk *disk;
 		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
 
+		ITERATE_RDEV(mddev,rdev,tmp)
+			if (rdev->raid_disk >= 0) {
+				char nm[20];
+				sprintf(nm, "rd%d", rdev->raid_disk);
+				sysfs_remove_link(&mddev->kobj, nm);
+			}
+
 		export_array(mddev);
 
 		mddev->array_size = 0;
@@ -2012,7 +2373,7 @@ static int autostart_array(dev_t startdev)
 		return err;
 	}
 
-	if (start_rdev->faulty) {
+	if (test_bit(Faulty, &start_rdev->flags)) {
 		printk(KERN_WARNING
 			"md: can not autostart based on faulty %s!\n",
 			bdevname(start_rdev->bdev,b));
@@ -2071,11 +2432,11 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	nr=working=active=failed=spare=0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		nr++;
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			failed++;
 		else {
 			working++;
-			if (rdev->in_sync)
+			if (test_bit(In_sync, &rdev->flags))
 				active++;
 			else
 				spare++;
@@ -2166,9 +2527,9 @@ static int get_disk_info(mddev_t * mddev, void __user * arg)
 		info.minor = MINOR(rdev->bdev->bd_dev);
 		info.raid_disk = rdev->raid_disk;
 		info.state = 0;
-		if (rdev->faulty)
+		if (test_bit(Faulty, &rdev->flags))
 			info.state |= (1<<MD_DISK_FAULTY);
-		else if (rdev->in_sync) {
+		else if (test_bit(In_sync, &rdev->flags)) {
 			info.state |= (1<<MD_DISK_ACTIVE);
 			info.state |= (1<<MD_DISK_SYNC);
 		}
@@ -2261,7 +2622,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		validate_super(mddev, rdev);
 		rdev->saved_raid_disk = rdev->raid_disk;
 
-		rdev->in_sync = 0; /* just to be sure */
+		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
 
@@ -2299,11 +2660,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		else
 			rdev->raid_disk = -1;
 
-		rdev->faulty = 0;
+		rdev->flags = 0;
+
 		if (rdev->raid_disk < mddev->raid_disks)
-			rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
-		else
-			rdev->in_sync = 0;
+			if (info->state & (1<<MD_DISK_SYNC))
+				set_bit(In_sync, &rdev->flags);
 
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
@@ -2402,14 +2763,14 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 		goto abort_export;
 	}
 
-	if (rdev->faulty) {
+	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING
 			"md: can not hot-add faulty %s disk to %s!\n",
 			bdevname(rdev->bdev,b), mdname(mddev));
 		err = -EINVAL;
 		goto abort_export;
 	}
-	rdev->in_sync = 0;
+	clear_bit(In_sync, &rdev->flags);
 	rdev->desc_nr = -1;
 	bind_rdev_to_array(rdev, mddev);
 
@@ -2929,12 +3290,22 @@ static int md_ioctl(struct inode *inode, struct file *file,
 
 	/*
 	 * The remaining ioctls are changing the state of the
-	 * superblock, so we do not allow read-only arrays
-	 * here:
+	 * superblock, so we do not allow them on read-only arrays.
+	 * However non-MD ioctls (e.g. get-size) will still come through
+	 * here and hit the 'default' below, so only disallow
+	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
 	 */
-	if (mddev->ro) {
-		err = -EROFS;
-		goto abort_unlock;
+	if (_IOC_TYPE(cmd) == MD_MAJOR &&
+	    mddev->ro && mddev->pers) {
+		if (mddev->ro == 2) {
+			mddev->ro = 0;
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+
+		} else {
+			err = -EROFS;
+			goto abort_unlock;
+		}
 	}
 
 	switch (cmd)
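mddev->ro is now a tri-state rather than a flag: 0 is read-write, 1 is explicitly read-only, and 2 is "auto-read-only", set by do_md_run() when start_readonly is active and cleared either here, by the first state-changing md ioctl, or by the first write in md_write_start() below. Named symbolically (the names are invented; the code uses the bare values):

	enum mddev_ro_state {
		MD_RDWR        = 0,	/* normal read-write operation */
		MD_RDONLY      = 1,	/* explicit read-only; md ioctls get -EROFS */
		MD_AUTO_RDONLY = 2,	/* read-only until first write or md ioctl */
	};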
@@ -3064,21 +3435,17 @@ static int md_thread(void * arg)
 	 */
 
 	allow_signal(SIGKILL);
-	complete(thread->event);
 	while (!kthread_should_stop()) {
-		void (*run)(mddev_t *);
 
-		wait_event_interruptible_timeout(thread->wqueue,
+		wait_event_timeout(thread->wqueue,
 				   test_bit(THREAD_WAKEUP, &thread->flags)
 				   || kthread_should_stop(),
 				   thread->timeout);
 		try_to_freeze();
 
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
-		run = thread->run;
-		if (run)
-			run(thread->mddev);
+		thread->run(thread->mddev);
 	}
 
 	return 0;
@@ -3097,7 +3464,6 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 				 const char *name)
 {
 	mdk_thread_t *thread;
-	struct completion event;
 
 	thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
 	if (!thread)
@@ -3106,18 +3472,14 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 	memset(thread, 0, sizeof(mdk_thread_t));
 	init_waitqueue_head(&thread->wqueue);
 
-	init_completion(&event);
-	thread->event = &event;
 	thread->run = run;
 	thread->mddev = mddev;
-	thread->name = name;
 	thread->timeout = MAX_SCHEDULE_TIMEOUT;
 	thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
 	if (IS_ERR(thread->tsk)) {
 		kfree(thread);
 		return NULL;
 	}
-	wait_for_completion(&event);
 	return thread;
 }
 
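md_thread is now plain kthread code: kthread_run() creates and wakes the task (making the old completion handshake and the thread->name field unnecessary), the loop exits when kthread_should_stop() turns true, and the matching kthread_stop() is issued from md_unregister_thread(). The general contract, sketched with stand-in names:

	static int worker(void *data)
	{
		while (!kthread_should_stop()) {
			/* wq, work_pending() and do_work() are stand-ins */
			wait_event_timeout(wq, work_pending() ||
					   kthread_should_stop(), timeout);
			do_work(data);
		}
		return 0;	/* value is handed back by kthread_stop() */
	}

	/* creator:  task = kthread_run(worker, arg, "name_%s", id);
	 * teardown: ret = kthread_stop(task);  - wakes the task and
	 * waits for worker() to return. */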
@@ -3136,7 +3498,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 		return;
 	}
 
-	if (!rdev || rdev->faulty)
+	if (!rdev || test_bit(Faulty, &rdev->flags))
 		return;
 /*
 	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
@@ -3322,8 +3684,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		seq_printf(seq, "%s : %sactive", mdname(mddev),
 						mddev->pers ? "" : "in");
 		if (mddev->pers) {
-			if (mddev->ro)
+			if (mddev->ro==1)
 				seq_printf(seq, " (read-only)");
+			if (mddev->ro==2)
+				seq_printf(seq, "(auto-read-only)");
 			seq_printf(seq, " %s", mddev->pers->name);
 		}
 
@@ -3334,7 +3698,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 				bdevname(rdev->bdev,b), rdev->desc_nr);
 			if (test_bit(WriteMostly, &rdev->flags))
 				seq_printf(seq, "(W)");
-			if (rdev->faulty) {
+			if (test_bit(Faulty, &rdev->flags)) {
 				seq_printf(seq, "(F)");
 				continue;
 			} else if (rdev->raid_disk < 0)
@@ -3363,11 +3727,15 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		if (mddev->pers) {
 			mddev->pers->status (seq, mddev);
 			seq_printf(seq, "\n ");
-			if (mddev->curr_resync > 2) {
-				status_resync (seq, mddev);
-				seq_printf(seq, "\n ");
-			} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
-				seq_printf(seq, " resync=DELAYED\n ");
+			if (mddev->pers->sync_request) {
+				if (mddev->curr_resync > 2) {
+					status_resync (seq, mddev);
+					seq_printf(seq, "\n ");
+				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+					seq_printf(seq, "\tresync=DELAYED\n ");
+				else if (mddev->recovery_cp < MaxSector)
+					seq_printf(seq, "\tresync=PENDING\n ");
+			}
 		} else
 			seq_printf(seq, "\n ");
 
@@ -3504,15 +3872,22 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 	if (bio_data_dir(bi) != WRITE)
 		return;
 
+	BUG_ON(mddev->ro == 1);
+	if (mddev->ro == 2) {
+		/* need to switch to read/write */
+		mddev->ro = 0;
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
 	atomic_inc(&mddev->writes_pending);
 	if (mddev->in_sync) {
-		spin_lock(&mddev->write_lock);
+		spin_lock_irq(&mddev->write_lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
 			mddev->sb_dirty = 1;
 			md_wakeup_thread(mddev->thread);
 		}
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 	}
 	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
 }
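The spin_lock -> spin_lock_irq conversions here and in md_update_sb()/md_check_recovery() follow from the barrier work: super_written_barrier() now takes write_lock from bio-completion (interrupt) context with spin_lock_irqsave, so every process-context holder must disable interrupts or risk deadlocking against that completion. The resulting split, as a sketch:

	/* process context: interrupts known enabled, the _irq variant suffices */
	spin_lock_irq(&mddev->write_lock);
	/* ... inspect/clear mddev->in_sync, drain mddev->biolist ... */
	spin_unlock_irq(&mddev->write_lock);

	/* completion (interrupt) context: must save and restore flags */
	spin_lock_irqsave(&mddev->write_lock, flags);
	/* ... park the failed bio on mddev->biolist ... */
	spin_unlock_irqrestore(&mddev->write_lock, flags);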
@@ -3568,9 +3943,7 @@ static void md_do_sync(mddev_t *mddev)
 	mddev->curr_resync = 2;
 
  try_again:
-	if (signal_pending(current) ||
-	    kthread_should_stop()) {
-		flush_signals(current);
+	if (kthread_should_stop()) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		goto skip;
 	}
@@ -3590,9 +3963,8 @@ static void md_do_sync(mddev_t *mddev)
 			 * time 'round when curr_resync == 2
 			 */
 			continue;
-		prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-		if (!signal_pending(current) &&
-		    !kthread_should_stop() &&
+		prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
+		if (!kthread_should_stop() &&
 		    mddev2->curr_resync >= mddev->curr_resync) {
 			printk(KERN_INFO "md: delaying resync of %s"
 			       " until %s has finished resync (they"
@@ -3608,12 +3980,13 @@ static void md_do_sync(mddev_t *mddev)
 		}
 	} while (mddev->curr_resync < 2);
 
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* resync follows the size requested by the personality,
 		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
-	else
+		mddev->resync_mismatches = 0;
+	} else
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
 
@@ -3626,7 +3999,8 @@ static void md_do_sync(mddev_t *mddev)
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
 	/* we don't use the checkpoint if there's a bitmap */
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
+	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 		j = mddev->recovery_cp;
 	else
 		j = 0;
@@ -3699,13 +4073,12 @@ static void md_do_sync(mddev_t *mddev)
 		}
 
 
-		if (signal_pending(current) || kthread_should_stop()) {
+		if (kthread_should_stop()) {
 			/*
 			 * got a signal, exit.
 			 */
 			printk(KERN_INFO
 				"md: md_do_sync() got signal ... exiting\n");
-			flush_signals(current);
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			goto out;
 		}
@@ -3727,7 +4100,7 @@ static void md_do_sync(mddev_t *mddev)
 		if (currspeed > sysctl_speed_limit_min) {
 			if ((currspeed > sysctl_speed_limit_max) ||
 					!is_mddev_idle(mddev)) {
-				msleep_interruptible(250);
+				msleep(250);
 				goto repeat;
 			}
 		}
@@ -3820,7 +4193,7 @@ void md_check_recovery(mddev_t *mddev)
 	if (mddev_trylock(mddev)==0) {
 		int spares =0;
 
-		spin_lock(&mddev->write_lock);
+		spin_lock_irq(&mddev->write_lock);
 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
 			mddev->in_sync = 1;
@@ -3828,7 +4201,7 @@ void md_check_recovery(mddev_t *mddev)
 		}
 		if (mddev->safemode == 1)
 			mddev->safemode = 0;
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 
 		if (mddev->sb_dirty)
 			md_update_sb(mddev);
@@ -3864,9 +4237,13 @@ void md_check_recovery(mddev_t *mddev)
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			goto unlock;
 		}
-		if (mddev->recovery)
-			/* probably just the RECOVERY_NEEDED flag */
-			mddev->recovery = 0;
+		/* Clear some bits that don't mean anything, but
+		 * might be left set
+		 */
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
+		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
 		/* no recovery is running.
 		 * remove any failed drives, then
@@ -3876,31 +4253,41 @@ void md_check_recovery(mddev_t *mddev)
 		 */
 		ITERATE_RDEV(mddev,rdev,rtmp)
 			if (rdev->raid_disk >= 0 &&
-			    (rdev->faulty || ! rdev->in_sync) &&
+			    (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
 			    atomic_read(&rdev->nr_pending)==0) {
-				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
+				if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
+					char nm[20];
+					sprintf(nm,"rd%d", rdev->raid_disk);
+					sysfs_remove_link(&mddev->kobj, nm);
 					rdev->raid_disk = -1;
+				}
 			}
 
 		if (mddev->degraded) {
 			ITERATE_RDEV(mddev,rdev,rtmp)
 				if (rdev->raid_disk < 0
-				    && !rdev->faulty) {
-					if (mddev->pers->hot_add_disk(mddev,rdev))
+				    && !test_bit(Faulty, &rdev->flags)) {
+					if (mddev->pers->hot_add_disk(mddev,rdev)) {
+						char nm[20];
+						sprintf(nm, "rd%d", rdev->raid_disk);
+						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 						spares++;
-					else
+					} else
 						break;
 				}
 		}
 
-		if (!spares && (mddev->recovery_cp == MaxSector )) {
-			/* nothing we can do ... */
+		if (spares) {
+			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		} else if (mddev->recovery_cp < MaxSector) {
+			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+			/* nothing to be done ... */
 			goto unlock;
-		}
+
 		if (mddev->pers->sync_request) {
 			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-			if (!spares)
-				set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
 				/* We are adding a device or devices to an array
 				 * which has the bitmap stored on all devices.
@@ -3975,7 +4362,7 @@ static int __init md_init(void)
 			" MD_SB_DISKS=%d\n",
 			MD_MAJOR_VERSION, MD_MINOR_VERSION,
 			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
-	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
+	printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
 			BITMAP_MINOR);
 
 	if (register_blkdev(MAJOR_NR, "md"))
@@ -4039,7 +4426,7 @@ static void autostart_arrays(int part)
 		if (IS_ERR(rdev))
 			continue;
 
-		if (rdev->faulty) {
+		if (test_bit(Faulty, &rdev->flags)) {
 			MD_BUG();
 			continue;
 		}
@@ -4086,6 +4473,23 @@ static __exit void md_exit(void)
 module_init(md_init)
 module_exit(md_exit)
 
+static int get_ro(char *buffer, struct kernel_param *kp)
+{
+	return sprintf(buffer, "%d", start_readonly);
+}
+static int set_ro(const char *val, struct kernel_param *kp)
+{
+	char *e;
+	int num = simple_strtoul(val, &e, 10);
+	if (*val && (*e == '\0' || *e == '\n')) {
+		start_readonly = num;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
+
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);
 EXPORT_SYMBOL(md_error);
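The closing hunk exposes the start_readonly policy as a runtime knob: module_param_call() wires set_ro()/get_ro() to a start_ro parameter with mode 0600, so (assuming the usual md-mod module name) it can be enabled with md-mod.start_ro=1 on the kernel command line or by writing 1 to /sys/module/md_mod/parameters/start_ro while the system is running.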