Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--	drivers/md/md.c	615
1 files changed, 398 insertions, 217 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..c2ff77ccec50 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
-	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
@@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit)
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
+	atomic_set(&new->openers, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
+	new->resync_min = 0;
 	new->resync_max = MaxSector;
 	new->level = LEVEL_NONE;
 
@@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel)
 	return NULL;
 }
 
+/* return the offset of the super block in 512byte sectors */
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
-	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
-	return MD_NEW_SIZE_BLOCKS(size);
+	sector_t num_sectors = bdev->bd_inode->i_size / 512;
+	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 {
-	sector_t size;
-
-	size = rdev->sb_offset;
+	sector_t num_sectors = rdev->sb_start;
 
 	if (chunk_size)
-		size &= ~((sector_t)chunk_size/1024 - 1);
-	return size;
+		num_sectors &= ~((sector_t)chunk_size/512 - 1);
+	return num_sectors;
 }
 
 static int alloc_disk_sb(mdk_rdev_t * rdev)
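
The renamed helpers make the units explicit: the 0.90 superblock sits in the last 64K-aligned, 64K-sized slot of the device. A minimal sketch of the arithmetic, assuming MD_NEW_SIZE_SECTORS() in md_p.h is the sector-based analogue of the old MD_NEW_SIZE_BLOCKS() (names below are illustrative, not from the patch):

	#define MD_RESERVED_SECTORS_EXAMPLE 128	/* 64K in 512-byte sectors */

	/* mirrors what calc_dev_sboffset() is expected to compute */
	static inline sector_t sb_start_example(sector_t dev_sectors)
	{
		/* round down to a 64K boundary, then back off one 64K unit */
		return (dev_sectors & ~(sector_t)(MD_RESERVED_SECTORS_EXAMPLE - 1))
			- MD_RESERVED_SECTORS_EXAMPLE;
	}
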
@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
 	rdev->sb_page = alloc_page(GFP_KERNEL);
 	if (!rdev->sb_page) {
 		printk(KERN_ALERT "md: out of memory.\n");
-		return -EINVAL;
+		return -ENOMEM;
 	}
 
 	return 0;
@@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		put_page(rdev->sb_page);
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
-		rdev->sb_offset = 0;
+		rdev->sb_start = 0;
 		rdev->size = 0;
 	}
 }
@@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
 		return 0;
 
 
-	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
 		goto fail;
 	rdev->sb_loaded = 1;
 	return 0;
@@ -543,17 +543,12 @@ fail:
 
 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
-	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
-		(sb1->set_uuid1 == sb2->set_uuid1) &&
-		(sb1->set_uuid2 == sb2->set_uuid2) &&
-		(sb1->set_uuid3 == sb2->set_uuid3))
-
-		return 1;
-
-	return 0;
+	return	sb1->set_uuid0 == sb2->set_uuid0 &&
+		sb1->set_uuid1 == sb2->set_uuid1 &&
+		sb1->set_uuid2 == sb2->set_uuid2 &&
+		sb1->set_uuid3 == sb2->set_uuid3;
 }
 
-
 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
 	int ret;
@@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 
 	if (!tmp1 || !tmp2) {
 		ret = 0;
-		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
 		goto abort;
 	}
 
@@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 	tmp1->nr_disks = 0;
 	tmp2->nr_disks = 0;
 
-	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
-		ret = 0;
-	else
-		ret = 1;
-
+	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
 abort:
 	kfree(tmp1);
 	kfree(tmp2);
@@ -658,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
  */
 
 struct super_type  {
 	char		    *name;
 	struct module	    *owner;
-	int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
-	int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
-	void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	int		    (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
+					  int minor_version);
+	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
+						sector_t num_sectors);
 };
 
 /*
@@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	mdp_super_t *sb;
 	int ret;
-	sector_t sb_offset;
 
 	/*
-	 * Calculate the position of the superblock,
+	 * Calculate the position of the superblock (512byte sectors),
 	 * it's at the end of the disk.
 	 *
 	 * It also happens to be a multiple of 4Kb.
 	 */
-	sb_offset = calc_dev_sboffset(rdev->bdev);
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 
 	ret = read_disk_sb(rdev, MD_SB_BYTES);
 	if (ret) return ret;
@@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	else
 		ret = 0;
 	}
-	rdev->size = calc_dev_size(rdev, sb->chunk_size);
+	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
 
 	if (rdev->size < sb->size && sb->level > 1)
 		/* "this cannot possibly happen" ... */
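
The trailing "/ 2" here is the conversion that runs through the whole series: rdev->size stays in kibibytes while the renamed helpers count 512-byte sectors. Hypothetical helpers, not part of the patch, that just state that convention:

	/* assumed convention: rdev->size is KB, sb_start/num_sectors are sectors */
	static inline sector_t kb_to_sectors_example(unsigned long long kb)
	{
		return (sector_t)kb * 2;	/* 1 KB == two 512-byte sectors */
	}

	static inline unsigned long long sectors_to_kb_example(sector_t s)
	{
		return s / 2;			/* truncates an odd trailing sector */
	}
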
@@ -1004,6 +996,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 }
 
 /*
+ * rdev_size_change for 0.90.0
+ */
+static unsigned long long
+super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+		return 0; /* component must fit device */
+	if (rdev->mddev->bitmap_offset)
+		return 0; /* can't move bitmap */
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	if (!num_sectors || num_sectors > rdev->sb_start)
+		num_sectors = rdev->sb_start;
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return num_sectors / 2; /* kB for sysfs */
+}
+
+
+/*
  * version 1 superblock
  */
 
@@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 {
 	struct mdp_superblock_1 *sb;
 	int ret;
-	sector_t sb_offset;
+	sector_t sb_start;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	int bmask;
 
 	/*
-	 * Calculate the position of the superblock.
+	 * Calculate the position of the superblock in 512byte sectors.
 	 * It is always aligned to a 4K boundary and
 	 * depeding on minor_version, it can be:
 	 * 0: At least 8K, but less than 12K, from end of device
@@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	 */
 	switch(minor_version) {
 	case 0:
-		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
-		sb_offset -= 8*2;
-		sb_offset &= ~(sector_t)(4*2-1);
-		/* convert from sectors to K */
-		sb_offset /= 2;
+		sb_start = rdev->bdev->bd_inode->i_size >> 9;
+		sb_start -= 8*2;
+		sb_start &= ~(sector_t)(4*2-1);
 		break;
 	case 1:
-		sb_offset = 0;
+		sb_start = 0;
 		break;
 	case 2:
-		sb_offset = 4;
+		sb_start = 8;
 		break;
 	default:
 		return -EINVAL;
 	}
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = sb_start;
 
 	/* superblock is rarely larger than 1K, but it can be larger,
 	 * and it is safe to read 4k, so we do that
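
With the "convert from sectors to K" step gone, all three v1.x placements are computed directly in sectors. A standalone restatement of the same switch (names are illustrative; dev_sectors stands for i_size >> 9):

	static sector_t v1_sb_start_example(int minor_version, sector_t dev_sectors)
	{
		switch (minor_version) {
		case 0:	/* at least 8K from the end, rounded down to 4K */
			return (dev_sectors - 8*2) & ~(sector_t)(4*2 - 1);
		case 1:	/* at the very start of the device */
			return 0;
		case 2:	/* 4K from the start of the device */
			return 8;
		default:
			return 0;	/* the real code returns -EINVAL here */
		}
	}
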
@@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
 	    sb->major_version != cpu_to_le32(1) ||
 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
-	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
 		return -EINVAL;
 
@@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
 	if (minor_version
-	    && rdev->data_offset < sb_offset + (rdev->sb_size/512))
+	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
 		return -EINVAL;
 
 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (minor_version)
 		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
 	else
-		rdev->size = rdev->sb_offset;
+		rdev->size = rdev->sb_start / 2;
 	if (rdev->size < le64_to_cpu(sb->data_size)/2)
 		return -EINVAL;
 	rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1328,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
+static unsigned long long
+super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+	struct mdp_superblock_1 *sb;
+	sector_t max_sectors;
+	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+		return 0; /* component must fit device */
+	if (rdev->sb_start < rdev->data_offset) {
+		/* minor versions 1 and 2; superblock before data */
+		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+		max_sectors -= rdev->data_offset;
+		if (!num_sectors || num_sectors > max_sectors)
+			num_sectors = max_sectors;
+	} else if (rdev->mddev->bitmap_offset) {
+		/* minor version 0 with bitmap we can't move */
+		return 0;
+	} else {
+		/* minor version 0; superblock after data */
+		sector_t sb_start;
+		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_start &= ~(sector_t)(4*2 - 1);
+		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		if (!num_sectors || num_sectors > max_sectors)
+			num_sectors = max_sectors;
+		rdev->sb_start = sb_start;
+	}
+	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+	sb->data_size = cpu_to_le64(num_sectors);
+	sb->super_offset = rdev->sb_start;
+	sb->sb_csum = calc_sb_1_csum(sb);
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return num_sectors / 2; /* kB for sysfs */
+}
 
 static struct super_type super_types[] = {
 	[0] = {
 		.name	= "0.90.0",
 		.owner	= THIS_MODULE,
 		.load_super	= super_90_load,
 		.validate_super	= super_90_validate,
 		.sync_super	= super_90_sync,
+		.rdev_size_change = super_90_rdev_size_change,
 	},
 	[1] = {
 		.name	= "md-1",
 		.owner	= THIS_MODULE,
 		.load_super	= super_1_load,
 		.validate_super	= super_1_validate,
 		.sync_super	= super_1_sync,
+		.rdev_size_change = super_1_rdev_size_change,
 	},
 };
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
-	struct list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev, *rdev2;
 
-	rdev_for_each(rdev, tmp, mddev1)
-		rdev_for_each(rdev2, tmp2, mddev2)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev1)
+		rdev_for_each_rcu(rdev2, mddev2)
 			if (rdev->bdev->bd_contains ==
-			    rdev2->bdev->bd_contains)
+			    rdev2->bdev->bd_contains) {
+				rcu_read_unlock();
 				return 1;
-
+			}
+	rcu_read_unlock();
 	return 0;
 }
 
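
Like load_super and sync_super, the new method is reached through super_types[] indexed by the array's metadata version. An illustrative caller, modeled on rdev_size_store() later in this diff (sizes in 512-byte sectors; a zero return means the request was refused):

	static unsigned long long try_resize_example(mdk_rdev_t *rdev,
						     sector_t num_sectors)
	{
		mddev_t *mddev = rdev->mddev;

		if (!mddev->persistent)		/* no on-disk metadata to move */
			return 0;
		return super_types[mddev->major_version].
			rdev_size_change(rdev, num_sectors);
	}
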
@@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			kobject_del(&rdev->kobj);
 		goto fail;
 	}
-	list_add(&rdev->same_set, &mddev->disks);
+	list_add_rcu(&rdev->same_set, &mddev->disks);
 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
 	return 0;
 
@@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		return;
 	}
 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
-	list_del_init(&rdev->same_set);
+	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 
 	/* We need to delay this, otherwise we can deadlock when
-	 * writing to 'remove' to "dev/state"
+	 * writing to 'remove' to "dev/state". We also need
+	 * to delay it due to rcu usage.
 	 */
+	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
 	schedule_work(&rdev->del_work);
@@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
 	if (rdev->mddev)
 		MD_BUG();
 	free_disk_sb(rdev);
-	list_del_init(&rdev->same_set);
 #ifndef MODULE
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
@@ -1758,11 +1808,11 @@ repeat:
 		dprintk("%s ", bdevname(rdev->bdev,b));
 		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
-				       rdev->sb_offset<<1, rdev->sb_size,
+				       rdev->sb_start, rdev->sb_size,
 				       rdev->sb_page);
 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
 				bdevname(rdev->bdev,b),
-				(unsigned long long)rdev->sb_offset);
+				(unsigned long long)rdev->sb_start);
 			rdev->sb_events = mddev->events;
 
 		} else
@@ -1787,7 +1837,7 @@ repeat:
 
 }
 
-/* words written to sysfs files may, or my not, be \n terminated.
+/* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
  */
 static int cmd_match(const char *cmd, const char *str)
@@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		err = 0;
 	}
+	if (!err)
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		slot = -1;
 	else if (e==buf || (*e && *e!= '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers) {
+	if (rdev->mddev->pers && slot == -1) {
 		/* Setting 'slot' on an active array requires also
 		 * updating the 'rd%d' link, and communicating
 		 * with the personality with ->hot_*_disk.
@@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		 * failed/spare devices. This normally happens automatically,
 		 * but not when the metadata is externally managed.
 		 */
-		if (slot != -1)
-			return -EBUSY;
 		if (rdev->raid_disk == -1)
 			return -EEXIST;
 		/* personality does all needed checks */
@@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		sysfs_remove_link(&rdev->mddev->kobj, nm);
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
+	} else if (rdev->mddev->pers) {
+		mdk_rdev_t *rdev2;
+		struct list_head *tmp;
+		/* Activating a spare .. or possibly reactivating
+		 * if we every get bitmaps working here.
+		 */
+
+		if (rdev->raid_disk != -1)
+			return -EBUSY;
+
+		if (rdev->mddev->pers->hot_add_disk == NULL)
+			return -EINVAL;
+
+		rdev_for_each(rdev2, tmp, rdev->mddev)
+			if (rdev2->raid_disk == slot)
+				return -EEXIST;
+
+		rdev->raid_disk = slot;
+		if (test_bit(In_sync, &rdev->flags))
+			rdev->saved_raid_disk = slot;
+		else
+			rdev->saved_raid_disk = -1;
+		err = rdev->mddev->pers->
+			hot_add_disk(rdev->mddev, rdev);
+		if (err) {
+			rdev->raid_disk = -1;
+			return err;
+		} else
+			sysfs_notify(&rdev->kobj, NULL, "state");
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+			printk(KERN_WARNING
+			       "md: cannot register "
+			       "%s for %s\n",
+			       nm, mdname(rdev->mddev));
+
+		/* don't wakeup anyone, leave that to userspace. */
 	} else {
 		if (slot >= rdev->mddev->raid_disks)
 			return -ENOSPC;
@@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
 		set_bit(In_sync, &rdev->flags);
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 	return len;
 }
@@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	unsigned long long offset = simple_strtoull(buf, &e, 10);
 	if (e==buf || (*e && *e != '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers)
+	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
 	if (rdev->size && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
@@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
+	unsigned long long size;
 	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
 
-	if (e==buf || (*e && *e != '\n'))
+	if (strict_strtoull(buf, 10, &size) < 0)
 		return -EINVAL;
-	if (my_mddev->pers)
-		return -EBUSY;
+	if (size < my_mddev->size)
+		return -EINVAL;
+	if (my_mddev->pers && rdev->raid_disk >= 0) {
+		if (my_mddev->persistent) {
+			size = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, size * 2);
+			if (!size)
+				return -EBUSY;
+		} else if (!size) {
+			size = (rdev->bdev->bd_inode->i_size >> 10);
+			size -= rdev->data_offset/2;
+		}
+		if (size < my_mddev->size)
+			return -EINVAL; /* component must fit device */
+	}
+
 	rdev->size = size;
-	if (size > oldsize && rdev->mddev->external) {
+	if (size > oldsize && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
 		 * a deadlock.  We have already changed rdev->size, and if
@@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (test_bit(AllReserved, &rdev2->flags) ||
 			    (rdev->bdev == rdev2->bdev &&
 			     rdev != rdev2 &&
-			     overlaps(rdev->data_offset, rdev->size,
-				      rdev2->data_offset, rdev2->size))) {
+			     overlaps(rdev->data_offset, rdev->size * 2,
+				      rdev2->data_offset,
+				      rdev2->size * 2))) {
 				overlap = 1;
 				break;
 			}
@@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			return -EBUSY;
 		}
 	}
-	if (size < my_mddev->size || my_mddev->size == 0)
-		my_mddev->size = size;
 	return len;
 }
 
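
The overlap check above now passes lengths in sectors (rdev->size * 2) so they match data_offset. A sketch of what overlaps(), whose body is not part of this diff, is assumed to test: two half-open (start, length) ranges collide unless one ends before the other begins:

	static int overlaps_example(sector_t s1, sector_t l1,
				    sector_t s2, sector_t l2)
	{
		if (s1 + l1 <= s2)	/* first range ends before second starts */
			return 0;
		if (s2 + l2 <= s1)	/* second range ends before first starts */
			return 0;
		return 1;
	}
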
@@ -2512,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
  *     When written, doesn't tear down array, but just stops it
  * suspended (not supported yet)
  *     All IO requests will block. The array can be reconfigured.
- *     Writing this, if accepted, will block until array is quiessent
+ *     Writing this, if accepted, will block until array is quiescent
  * readonly
  *     no resync can happen.  no superblocks get written.
  *     write requests fail
@@ -2585,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%s\n", array_states[st]);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_stop(mddev_t * mddev, int ro, int is_open);
 static int do_md_run(mddev_t * mddev);
 static int restart_array(mddev_t *mddev);
 
@@ -2599,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break;
 	case clear:
 		/* stopping an active array */
-		if (atomic_read(&mddev->active) > 1)
+		if (atomic_read(&mddev->openers) > 0)
 			return -EBUSY;
-		err = do_md_stop(mddev, 0);
+		err = do_md_stop(mddev, 0, 0);
 		break;
 	case inactive:
 		/* stopping an active array */
 		if (mddev->pers) {
-			if (atomic_read(&mddev->active) > 1)
+			if (atomic_read(&mddev->openers) > 0)
 				return -EBUSY;
-			err = do_md_stop(mddev, 2);
+			err = do_md_stop(mddev, 2, 0);
 		} else
 			err = 0; /* already inactive */
 		break;
@@ -2616,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break; /* not supported yet */
 	case readonly:
 		if (mddev->pers)
-			err = do_md_stop(mddev, 1);
+			err = do_md_stop(mddev, 1, 0);
 		else {
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
@@ -2626,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	case read_auto:
 		if (mddev->pers) {
 			if (mddev->ro != 1)
-				err = do_md_stop(mddev, 1);
+				err = do_md_stop(mddev, 1, 0);
 			else
 				err = restart_array(mddev);
 			if (err == 0) {
@@ -2681,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 	if (err)
 		return err;
-	else
+	else {
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
 		return len;
+	}
 }
 static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2785,7 +2887,7 @@ size_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
 }
 
-static int update_size(mddev_t *mddev, unsigned long size);
+static int update_size(mddev_t *mddev, sector_t num_sectors);
 
 static ssize_t
 size_store(mddev_t *mddev, const char *buf, size_t len)
@@ -2802,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EINVAL;
 
 	if (mddev->pers) {
-		err = update_size(mddev, size);
+		err = update_size(mddev, size * 2);
 		md_update_sb(mddev, 1);
 	} else {
 		if (mddev->size == 0 ||
@@ -2899,7 +3001,7 @@ action_show(mddev_t *mddev, char *page)
 			type = "check";
 		else
 			type = "repair";
-	} else
+	} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
 		type = "recover";
 	}
 	return sprintf(page, "%s\n", type);
@@ -2921,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+	else if (cmd_match(page, "resync"))
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else if (cmd_match(page, "recover")) {
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	else if (cmd_match(page, "reshape")) {
+	} else if (cmd_match(page, "reshape")) {
 		int err;
 		if (mddev->pers->start_reshape == NULL)
 			return -EINVAL;
 		err = mddev->pers->start_reshape(mddev);
 		if (err)
 			return err;
+		sysfs_notify(&mddev->kobj, NULL, "degraded");
 	} else {
 		if (cmd_match(page, "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2940,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	return len;
 }
 
@@ -3049,11 +3156,11 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
-	resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
-	dt = ((jiffies - mddev->resync_mark) / HZ);
+	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
+	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
-	db = resync - (mddev->resync_mark_cnt);
-	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+	db = resync - mddev->resync_mark_cnt;
+	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
 }
 
 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
@@ -3075,6 +3182,36 @@ sync_completed_show(mddev_t *mddev, char *page)
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
 static ssize_t
+min_sync_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long)mddev->resync_min);
+}
+static ssize_t
+min_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	unsigned long long min;
+	if (strict_strtoull(buf, 10, &min))
+		return -EINVAL;
+	if (min > mddev->resync_max)
+		return -EINVAL;
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
+	/* Must be a multiple of chunk_size */
+	if (mddev->chunk_size) {
+		if (min & (sector_t)((mddev->chunk_size>>9)-1))
+			return -EINVAL;
+	}
+	mddev->resync_min = min;
+
+	return len;
+}
+
+static struct md_sysfs_entry md_min_sync =
+__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
+
+static ssize_t
 max_sync_show(mddev_t *mddev, char *page)
 {
 	if (mddev->resync_max == MaxSector)
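
The chunk-alignment test in min_sync_store() relies on chunk_size being a power of two: AND-ing with (chunk_sectors - 1) is a cheap modulo. An equivalent standalone form (illustrative):

	static int chunk_aligned_example(sector_t min, int chunk_size)
	{
		sector_t chunk_sectors = chunk_size >> 9;	/* bytes -> sectors */

		return (min & (chunk_sectors - 1)) == 0;	/* min % chunk_sectors == 0 */
	}
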
@@ -3089,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 	if (strncmp(buf, "max", 3) == 0)
 		mddev->resync_max = MaxSector;
 	else {
-		char *ep;
-		unsigned long long max = simple_strtoull(buf, &ep, 10);
-		if (ep == buf || (*ep != 0 && *ep != '\n'))
+		unsigned long long max;
+		if (strict_strtoull(buf, 10, &max))
+			return -EINVAL;
+		if (max < mddev->resync_min)
 			return -EINVAL;
 		if (max < mddev->resync_max &&
 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3222,6 +3360,7 @@ static struct attribute *md_redundancy_attrs[] = {
 	&md_sync_speed.attr,
 	&md_sync_force_parallel.attr,
 	&md_sync_completed.attr,
+	&md_min_sync.attr,
 	&md_max_sync.attr,
 	&md_suspend_lo.attr,
 	&md_suspend_hi.attr,
@@ -3326,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	disk->queue = mddev->queue;
 	add_disk(disk);
 	mddev->gendisk = disk;
-	mutex_unlock(&disks_mutex);
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
 				     "%s", "md");
+	mutex_unlock(&disks_mutex);
 	if (error)
 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
 		       disk->disk_name);
@@ -3341,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data)
 {
 	mddev_t *mddev = (mddev_t *) data;
 
-	mddev->safemode = 1;
+	if (!atomic_read(&mddev->writes_pending)) {
+		mddev->safemode = 1;
+		if (mddev->external)
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
+	}
 	md_wakeup_thread(mddev->thread);
 }
 
@@ -3432,22 +3575,23 @@ static int do_md_run(mddev_t * mddev)
 		 * We don't want the data to overlap the metadata,
 		 * Internal Bitmap issues has handled elsewhere.
 		 */
-		if (rdev->data_offset < rdev->sb_offset) {
+		if (rdev->data_offset < rdev->sb_start) {
 			if (mddev->size &&
 			    rdev->data_offset + mddev->size*2
-			    > rdev->sb_offset*2) {
+			    > rdev->sb_start) {
 				printk("md: %s: data overlaps metadata\n",
 				       mdname(mddev));
 				return -EINVAL;
 			}
 		} else {
-			if (rdev->sb_offset*2 + rdev->sb_size/512
+			if (rdev->sb_start + rdev->sb_size/512
 			    > rdev->data_offset) {
 				printk("md: %s: metadata overlaps data\n",
 				       mdname(mddev));
 				return -EINVAL;
 			}
 		}
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 
 	md_probe(mddev->unit, NULL, NULL);
@@ -3519,7 +3663,9 @@ static int do_md_run(mddev_t * mddev)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
 	err = mddev->pers->run(mddev);
-	if (!err && mddev->pers->sync_request) {
+	if (err)
+		printk(KERN_ERR "md: pers->run() failed ...\n");
+	else if (mddev->pers->sync_request) {
 		err = bitmap_create(mddev);
 		if (err) {
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3528,7 +3674,6 @@ static int do_md_run(mddev_t * mddev)
 		}
 	}
 	if (err) {
-		printk(KERN_ERR "md: pers->run() failed ...\n");
 		module_put(mddev->pers->owner);
 		mddev->pers = NULL;
 		bitmap_destroy(mddev);
@@ -3563,7 +3708,7 @@ static int do_md_run(mddev_t * mddev)
 	if (mddev->flags)
 		md_update_sb(mddev, 0);
 
-	set_capacity(disk, mddev->array_size<<1);
+	set_capacity(disk, mddev->array_sectors);
 
 	/* If we call blk_queue_make_request here, it will
 	 * re-initialise max_sectors etc which may have been
@@ -3608,6 +3753,9 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->changed = 1;
 	md_new_event(mddev);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
+	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
 	return 0;
 }
@@ -3615,38 +3763,25 @@ static int do_md_run(mddev_t * mddev)
 static int restart_array(mddev_t *mddev)
 {
 	struct gendisk *disk = mddev->gendisk;
-	int err;
 
-	/*
-	 * Complain if it has no devices
-	 */
-	err = -ENXIO;
-	if (list_empty(&mddev->disks))
-		goto out;
-
-	if (mddev->pers) {
-		err = -EBUSY;
-		if (!mddev->ro)
-			goto out;
-
-		mddev->safemode = 0;
-		mddev->ro = 0;
-		set_disk_ro(disk, 0);
-
-		printk(KERN_INFO "md: %s switched to read-write mode.\n",
-			mdname(mddev));
-		/*
-		 * Kick recovery or resync if necessary
-		 */
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-		md_wakeup_thread(mddev->thread);
-		md_wakeup_thread(mddev->sync_thread);
-		err = 0;
-	} else
-		err = -EINVAL;
-
-out:
-	return err;
+	/* Complain if it has no devices */
+	if (list_empty(&mddev->disks))
+		return -ENXIO;
+	if (!mddev->pers)
+		return -EINVAL;
+	if (!mddev->ro)
+		return -EBUSY;
+	mddev->safemode = 0;
+	mddev->ro = 0;
+	set_disk_ro(disk, 0);
+	printk(KERN_INFO "md: %s switched to read-write mode.\n",
+		mdname(mddev));
+	/* Kick recovery or resync if necessary */
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	md_wakeup_thread(mddev->sync_thread);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	return 0;
 }
 
 /* similar to deny_write_access, but accounts for our holding a reference
@@ -3680,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file)
  *   1 - switch to readonly
  *   2 - stop but do not disassemble array
  */
-static int do_md_stop(mddev_t * mddev, int mode)
+static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 {
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
 
+	if (atomic_read(&mddev->openers) > is_open) {
+		printk("md: %s still in use.\n",mdname(mddev));
+		return -EBUSY;
+	}
+
 	if (mddev->pers) {
-		if (atomic_read(&mddev->active)>2) {
-			printk("md: %s still in use.\n",mdname(mddev));
-			return -EBUSY;
-		}
 
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode)
 
 		export_array(mddev);
 
-		mddev->array_size = 0;
+		mddev->array_sectors = 0;
 		mddev->size = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
+		mddev->resync_min = 0;
 		mddev->resync_max = MaxSector;
 		mddev->reshape_position = MaxSector;
 		mddev->external = 0;
@@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
 			mdname(mddev));
 	err = 0;
 	md_new_event(mddev);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
 out:
 	return err;
 }
@@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev)
 	err = do_md_run (mddev);
 	if (err) {
 		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
-		do_md_stop (mddev, 0);
+		do_md_stop (mddev, 0, 0);
 	}
 }
 
@@ -3927,8 +4065,10 @@ static void autorun_devices(int part)
 		/* on success, candidates will be empty, on error
 		 * it won't...
 		 */
-		rdev_for_each_list(rdev, tmp, candidates)
+		rdev_for_each_list(rdev, tmp, candidates) {
+			list_del_init(&rdev->same_set);
 			export_rdev(rdev);
+		}
 		mddev_put(mddev);
 	}
 	printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	md_allow_write(mddev);
+	if (md_allow_write(mddev))
+		file = kmalloc(sizeof(*file), GFP_NOIO);
+	else
+		file = kmalloc(sizeof(*file), GFP_KERNEL);
 
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out;
 
@@ -4044,15 +4186,12 @@ out:
 static int get_disk_info(mddev_t * mddev, void __user * arg)
 {
 	mdu_disk_info_t info;
-	unsigned int nr;
 	mdk_rdev_t *rdev;
 
 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;
 
-	nr = info.number;
-
-	rdev = find_rdev_nr(mddev, nr);
+	rdev = find_rdev_nr(mddev, info.number);
 	if (rdev) {
 		info.major = MAJOR(rdev->bdev->bd_dev);
 		info.minor = MINOR(rdev->bdev->bd_dev);
@@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		}
 		if (err)
 			export_rdev(rdev);
+		else
+			sysfs_notify(&rdev->kobj, NULL, "state");
 
 		md_update_sb(mddev, 1);
+		if (mddev->degraded)
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		return err;
@@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
 		if (!mddev->persistent) {
 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
-			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else
-			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
-		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
 	char b[BDEVNAME_SIZE];
 	mdk_rdev_t *rdev;
 
-	if (!mddev->pers)
-		return -ENODEV;
-
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
@@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
 	int err;
-	unsigned int size;
 	mdk_rdev_t *rdev;
 
 	if (!mddev->pers)
@@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	}
 
 	if (mddev->persistent)
-		rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 	else
-		rdev->sb_offset =
-			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-	size = calc_dev_size(rdev, mddev->chunk_size);
-	rdev->size = size;
+	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING
@@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
-static int update_size(mddev_t *mddev, unsigned long size)
+static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
 	mdk_rdev_t * rdev;
 	int rv;
 	struct list_head *tmp;
-	int fit = (size == 0);
+	int fit = (num_sectors == 0);
 
 	if (mddev->pers->resize == NULL)
 		return -EINVAL;
-	/* The "size" is the amount of each device that is used.
-	 * This can only make sense for arrays with redundancy.
-	 * linear and raid0 always use whatever space is available
-	 * We can only consider changing the size if no resync
-	 * or reconstruction is happening, and if the new size
-	 * is acceptable. It must fit before the sb_offset or,
-	 * if that is <data_offset, it must fit before the
-	 * size of each device.
-	 * If size is zero, we find the largest size that fits.
+	/* The "num_sectors" is the number of sectors of each device that
+	 * is used.  This can only make sense for arrays with redundancy.
+	 * linear and raid0 always use whatever space is available. We can only
+	 * consider changing this number if no resync or reconstruction is
+	 * happening, and if the new size is acceptable. It must fit before the
+	 * sb_start or, if that is <data_offset, it must fit before the size
+	 * of each device.  If num_sectors is zero, we find the largest size
+	 * that fits.
+
 	 */
 	if (mddev->sync_thread)
 		return -EBUSY;
@@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size)
 		sector_t avail;
 		avail = rdev->size * 2;
 
-		if (fit && (size == 0 || size > avail/2))
-			size = avail/2;
-		if (avail < ((sector_t)size << 1))
+		if (fit && (num_sectors == 0 || num_sectors > avail))
+			num_sectors = avail;
+		if (avail < num_sectors)
 			return -ENOSPC;
 	}
-	rv = mddev->pers->resize(mddev, (sector_t)size *2);
+	rv = mddev->pers->resize(mddev, num_sectors);
 	if (!rv) {
 		struct block_device *bdev;
 
 		bdev = bdget_disk(mddev->gendisk, 0);
 		if (bdev) {
 			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
 			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
@@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
 	if (info->size >= 0 && mddev->size != info->size)
-		rv = update_size(mddev, info->size);
+		rv = update_size(mddev, (sector_t)info->size * 2);
 
 	if (mddev->raid_disks != info->raid_disks)
 		rv = update_raid_disks(mddev, info->raid_disks);
@@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 	return 0;
 }
 
+/*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	mddev_t *mddev = bdev->bd_disk->private_data;
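
The hunk only relocates the comment next to the function it describes; the body itself is outside this hunk. A sketch of what it would compute under the geometry the comment names (assumed, not shown in this diff):

	static int md_getgeo_example(struct block_device *bdev,
				     struct hd_geometry *geo)
	{
		mddev_t *mddev = bdev->bd_disk->private_data;

		geo->heads = 2;
		geo->sectors = 4;
		/* 2 heads * 4 sectors = 8 sectors per cylinder */
		geo->cylinders = get_capacity(mddev->gendisk) / 8;
		return 0;
	}
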
@@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			goto done_unlock;
 
 		case STOP_ARRAY:
-			err = do_md_stop (mddev, 0);
+			err = do_md_stop (mddev, 0, 1);
 			goto done_unlock;
 
 		case STOP_ARRAY_RO:
-			err = do_md_stop (mddev, 1);
+			err = do_md_stop (mddev, 1, 1);
 			goto done_unlock;
 
-	/*
-	 * We have a problem here : there is no easy way to give a CHS
-	 * virtual geometry. We currently pretend that we have a 2 heads
-	 * 4 sectors (with a BIG number of cylinders...). This drives
-	 * dosfs just mad... ;-)
-	 */
 	}
 
 	/*
@@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	 * here and hit the 'default' below, so only disallow
 	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
 	 */
-	if (_IOC_TYPE(cmd) == MD_MAJOR &&
-	    mddev->ro && mddev->pers) {
+	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
 		if (mddev->ro == 2) {
 			mddev->ro = 0;
-			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			md_wakeup_thread(mddev->thread);
-
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
 		} else {
 			err = -EROFS;
 			goto abort_unlock;
@@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file)
 
 	err = 0;
 	mddev_get(mddev);
+	atomic_inc(&mddev->openers);
 	mddev_unlock(mddev);
 
 	check_disk_change(inode->i_bdev);
@@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file)
 	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
 
 	BUG_ON(!mddev);
+	atomic_dec(&mddev->openers);
 	mddev_put(mddev);
 
 	return 0;
@@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!mddev->pers->error_handler)
 		return;
 	mddev->pers->error_handler(mddev,rdev);
+	if (mddev->degraded)
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	set_bit(StateChanged, &rdev->flags);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -5258,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		if (!list_empty(&mddev->disks)) {
 			if (mddev->pers)
 				seq_printf(seq, "\n      %llu blocks",
-					   (unsigned long long)mddev->array_size);
+					   (unsigned long long)
+					   mddev->array_sectors / 2);
 			else
 				seq_printf(seq, "\n      %llu blocks",
 					   (unsigned long long)size);
 		}
 		if (mddev->persistent) {
 			if (mddev->major_version != 0 ||
@@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
 static int is_mddev_idle(mddev_t *mddev)
 {
 	mdk_rdev_t * rdev;
-	struct list_head *tmp;
 	int idle;
 	long curr_events;
 
 	idle = 1;
-	rdev_for_each(rdev, tmp, mddev) {
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
 		curr_events = disk_stat_read(disk, sectors[0]) +
 			      disk_stat_read(disk, sectors[1]) -
@@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
5428 idle = 0; 5571 idle = 0;
5429 } 5572 }
5430 } 5573 }
5574 rcu_read_unlock();
5431 return idle; 5575 return idle;
5432} 5576}
5433 5577
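[Annotation] is_mddev_idle() only samples per-disk IO statistics, so it can walk the device list under rcu_read_lock() instead of requiring the caller to pin the list, provided the removal path deletes entries with list_del_rcu() and defers the free past a grace period (which this patch series arranges). The resulting read-side shape, with sample_stats() as a hypothetical placeholder for the disk_stat_read() arithmetic above:

rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
        /* rdev is only guaranteed valid inside this read-side critical
         * section; no sleeping or blocking calls are allowed here. */
        sample_stats(rdev);
}
rcu_read_unlock();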
@@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
5451 */ 5595 */
5452void md_write_start(mddev_t *mddev, struct bio *bi) 5596void md_write_start(mddev_t *mddev, struct bio *bi)
5453{ 5597{
5598 int did_change = 0;
5454 if (bio_data_dir(bi) != WRITE) 5599 if (bio_data_dir(bi) != WRITE)
5455 return; 5600 return;
5456 5601
@@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5461 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5606 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5462 md_wakeup_thread(mddev->thread); 5607 md_wakeup_thread(mddev->thread);
5463 md_wakeup_thread(mddev->sync_thread); 5608 md_wakeup_thread(mddev->sync_thread);
5609 did_change = 1;
5464 } 5610 }
5465 atomic_inc(&mddev->writes_pending); 5611 atomic_inc(&mddev->writes_pending);
5466 if (mddev->safemode == 1) 5612 if (mddev->safemode == 1)
@@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5471 mddev->in_sync = 0; 5617 mddev->in_sync = 0;
5472 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5618 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5473 md_wakeup_thread(mddev->thread); 5619 md_wakeup_thread(mddev->thread);
5620 did_change = 1;
5474 } 5621 }
5475 spin_unlock_irq(&mddev->write_lock); 5622 spin_unlock_irq(&mddev->write_lock);
5476 sysfs_notify(&mddev->kobj, NULL, "array_state");
5477 } 5623 }
5624 if (did_change)
5625 sysfs_notify(&mddev->kobj, NULL, "array_state");
5478 wait_event(mddev->sb_wait, 5626 wait_event(mddev->sb_wait,
5479 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 5627 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5480 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5628 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
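[Annotation] The did_change flag exists to move sysfs_notify() out from under write_lock: sysfs_notify() may sleep, which is illegal while holding a spinlock with interrupts disabled. The state change is decided and recorded under the lock; the notification is emitted after the unlock. The same shape as a userspace sketch (a pthread mutex stands in for the spinlock; notify_state_changed() is hypothetical):

#include <pthread.h>

extern void notify_state_changed(void);  /* hypothetical; may sleep */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int in_sync = 1;

void write_start(void)
{
        int did_change = 0;

        pthread_mutex_lock(&lock);
        if (in_sync) {            /* decide and record under the lock */
                in_sync = 0;
                did_change = 1;
        }
        pthread_mutex_unlock(&lock);

        if (did_change)           /* act only after dropping the lock */
                notify_state_changed();
}

md_check_recovery() below receives the identical treatment.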
@@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev)
5495 * may proceed without blocking. It is important to call this before 5643 * may proceed without blocking. It is important to call this before
5496 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5644 * attempting a GFP_KERNEL allocation while holding the mddev lock.
5497 * Must be called with mddev_lock held. 5645 * Must be called with mddev_lock held.
5646 *
5647 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
5648 * is dropped, so return -EAGAIN after notifying userspace.
5498 */ 5649 */
5499void md_allow_write(mddev_t *mddev) 5650int md_allow_write(mddev_t *mddev)
5500{ 5651{
5501 if (!mddev->pers) 5652 if (!mddev->pers)
5502 return; 5653 return 0;
5503 if (mddev->ro) 5654 if (mddev->ro)
5504 return; 5655 return 0;
5656 if (!mddev->pers->sync_request)
5657 return 0;
5505 5658
5506 spin_lock_irq(&mddev->write_lock); 5659 spin_lock_irq(&mddev->write_lock);
5507 if (mddev->in_sync) { 5660 if (mddev->in_sync) {
@@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev)
5512 mddev->safemode = 1; 5665 mddev->safemode = 1;
5513 spin_unlock_irq(&mddev->write_lock); 5666 spin_unlock_irq(&mddev->write_lock);
5514 md_update_sb(mddev, 0); 5667 md_update_sb(mddev, 0);
5515
5516 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5668 sysfs_notify(&mddev->kobj, NULL, "array_state");
5517 /* wait for the dirty state to be recorded in the metadata */
5518 wait_event(mddev->sb_wait,
5519 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5520 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
5521 } else 5669 } else
5522 spin_unlock_irq(&mddev->write_lock); 5670 spin_unlock_irq(&mddev->write_lock);
5671
5672 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
5673 return -EAGAIN;
5674 else
5675 return 0;
5523} 5676}
5524EXPORT_SYMBOL_GPL(md_allow_write); 5677EXPORT_SYMBOL_GPL(md_allow_write);
5525 5678
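[Annotation] Changing md_allow_write() from void to int closes a deadlock with externally managed metadata: in the ->external case the kernel cannot clear MD_CHANGE_CLEAN itself, so instead of blocking on sb_wait for a metadata update that only userspace can perform, it notifies userspace and returns -EAGAIN. Callers must now check the result; a plausible caller shape (illustrative, not a quote of any specific call site):

int err = md_allow_write(mddev);
if (err)            /* -EAGAIN: external metadata, writes not yet allowed */
        return err; /* back out rather than block while holding the lock */
/* ...the GFP_KERNEL allocation the comment above warns about... */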
@@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev)
5625 max_sectors = mddev->resync_max_sectors; 5778 max_sectors = mddev->resync_max_sectors;
5626 mddev->resync_mismatches = 0; 5779 mddev->resync_mismatches = 0;
5627 /* we don't use the checkpoint if there's a bitmap */ 5780 /* we don't use the checkpoint if there's a bitmap */
5628 if (!mddev->bitmap && 5781 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5629 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5782 j = mddev->resync_min;
5783 else if (!mddev->bitmap)
5630 j = mddev->recovery_cp; 5784 j = mddev->recovery_cp;
5785
5631 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5786 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5632 max_sectors = mddev->size << 1; 5787 max_sectors = mddev->size << 1;
5633 else { 5788 else {
@@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev)
5796 5951
5797 skip: 5952 skip:
5798 mddev->curr_resync = 0; 5953 mddev->curr_resync = 0;
5954 mddev->resync_min = 0;
5799 mddev->resync_max = MaxSector; 5955 mddev->resync_max = MaxSector;
5800 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5956 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5801 wake_up(&resync_wait); 5957 wake_up(&resync_wait);
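[Annotation] resync_min is the in-kernel side of a new sync_min sysfs attribute bounding where a requested check/repair starts, complementing the existing sync_max upper bound; it is consumed at the start of a MD_RECOVERY_REQUESTED pass and reset to 0 at the skip: label once the pass ends, so the window applies to a single run. A sketch of scrubbing only a window of an array from userspace (paths are examples; values are 512-byte sectors, as with sync_max):

#include <stdio.h>

static int sysfs_write(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        sysfs_write("/sys/block/md0/md/sync_min", "0");
        sysfs_write("/sys/block/md0/md/sync_max", "2097152");
        return sysfs_write("/sys/block/md0/md/sync_action", "check");
}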
@@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev)
5845 if (rdev->raid_disk < 0 6001 if (rdev->raid_disk < 0
5846 && !test_bit(Faulty, &rdev->flags)) { 6002 && !test_bit(Faulty, &rdev->flags)) {
5847 rdev->recovery_offset = 0; 6003 rdev->recovery_offset = 0;
5848 if (mddev->pers->hot_add_disk(mddev,rdev)) { 6004 if (mddev->pers->
6005 hot_add_disk(mddev, rdev) == 0) {
5849 char nm[20]; 6006 char nm[20];
5850 sprintf(nm, "rd%d", rdev->raid_disk); 6007 sprintf(nm, "rd%d", rdev->raid_disk);
5851 if (sysfs_create_link(&mddev->kobj, 6008 if (sysfs_create_link(&mddev->kobj,
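[Annotation] The flipped test in remove_and_add_spares() accompanies a convention change rather than a pure logic fix: the personalities' ->hot_add_disk() methods were rationalised to the standard kernel style of returning 0 on success and a negative errno on failure, so the success path (creating the rd%d link and counting the spare) now triggers on == 0. In sketch form:

if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
        /* success (0): expose the rd%d sysfs link, count the spare */
} else {
        /* failure (-errno): leave the device out of the active set */
}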
@@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev)
5920 int spares = 0; 6077 int spares = 0;
5921 6078
5922 if (!mddev->external) { 6079 if (!mddev->external) {
6080 int did_change = 0;
5923 spin_lock_irq(&mddev->write_lock); 6081 spin_lock_irq(&mddev->write_lock);
5924 if (mddev->safemode && 6082 if (mddev->safemode &&
5925 !atomic_read(&mddev->writes_pending) && 6083 !atomic_read(&mddev->writes_pending) &&
5926 !mddev->in_sync && 6084 !mddev->in_sync &&
5927 mddev->recovery_cp == MaxSector) { 6085 mddev->recovery_cp == MaxSector) {
5928 mddev->in_sync = 1; 6086 mddev->in_sync = 1;
6087 did_change = 1;
5929 if (mddev->persistent) 6088 if (mddev->persistent)
5930 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6089 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5931 } 6090 }
5932 if (mddev->safemode == 1) 6091 if (mddev->safemode == 1)
5933 mddev->safemode = 0; 6092 mddev->safemode = 0;
5934 spin_unlock_irq(&mddev->write_lock); 6093 spin_unlock_irq(&mddev->write_lock);
6094 if (did_change)
6095 sysfs_notify(&mddev->kobj, NULL, "array_state");
5935 } 6096 }
5936 6097
5937 if (mddev->flags) 6098 if (mddev->flags)
5938 md_update_sb(mddev, 0); 6099 md_update_sb(mddev, 0);
5939 6100
6101 rdev_for_each(rdev, rtmp, mddev)
6102 if (test_and_clear_bit(StateChanged, &rdev->flags))
6103 sysfs_notify(&rdev->kobj, NULL, "state");
6104
5940 6105
5941 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6106 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5942 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6107 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev)
5951 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6116 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5952 /* success...*/ 6117 /* success...*/
5953 /* activate any spares */ 6118 /* activate any spares */
5954 mddev->pers->spare_active(mddev); 6119 if (mddev->pers->spare_active(mddev))
6120 sysfs_notify(&mddev->kobj, NULL,
6121 "degraded");
5955 } 6122 }
5956 md_update_sb(mddev, 1); 6123 md_update_sb(mddev, 1);
5957 6124
@@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev)
5965 mddev->recovery = 0; 6132 mddev->recovery = 0;
5966 /* flag recovery needed just to double check */ 6133 /* flag recovery needed just to double check */
5967 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6134 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6135 sysfs_notify(&mddev->kobj, NULL, "sync_action");
5968 md_new_event(mddev); 6136 md_new_event(mddev);
5969 goto unlock; 6137 goto unlock;
5970 } 6138 }
6139 /* Set RUNNING before clearing NEEDED to avoid
6140 * any transients in the value of "sync_action".
6141 */
6142 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6143 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5971 /* Clear some bits that don't mean anything, but 6144 /* Clear some bits that don't mean anything, but
5972 * might be left set 6145 * might be left set
5973 */ 6146 */
5974 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5975 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 6147 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5976 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 6148 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5977 6149
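[Annotation] The sync_action attribute is derived from the recovery bits, so the order of the two bit operations is what prevents a transient: with RUNNING set before NEEDED is cleared, at least one bit is set at every instant and a concurrent reader never sees a spurious "idle". Roughly (reader logic simplified from action_show()):

/* Reader: */
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
        /* report the pending or running action */ ;
else
        /* report "idle" */ ;

/* Writer: set RUNNING first, then clear NEEDED. The old order
 * (clear NEEDED here, set RUNNING much later) left a window in
 * which neither bit was set. */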
@@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev)
5989 /* Cannot proceed */ 6161 /* Cannot proceed */
5990 goto unlock; 6162 goto unlock;
5991 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6163 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6164 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5992 } else if ((spares = remove_and_add_spares(mddev))) { 6165 } else if ((spares = remove_and_add_spares(mddev))) {
5993 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6166 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5994 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6167 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6168 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5995 } else if (mddev->recovery_cp < MaxSector) { 6169 } else if (mddev->recovery_cp < MaxSector) {
5996 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6170 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6171 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5997 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6172 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5998 /* nothing to be done ... */ 6173 /* nothing to be done ... */
5999 goto unlock; 6174 goto unlock;
6000 6175
6001 if (mddev->pers->sync_request) { 6176 if (mddev->pers->sync_request) {
6002 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6003 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 6177 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6004 /* We are adding a device or devices to an array 6178 /* We are adding a device or devices to an array
6005 * which has the bitmap stored on all devices. 6179 * which has the bitmap stored on all devices.
@@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev)
6018 mddev->recovery = 0; 6192 mddev->recovery = 0;
6019 } else 6193 } else
6020 md_wakeup_thread(mddev->sync_thread); 6194 md_wakeup_thread(mddev->sync_thread);
6195 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6021 md_new_event(mddev); 6196 md_new_event(mddev);
6022 } 6197 }
6023 unlock: 6198 unlock:
6199 if (!mddev->sync_thread) {
6200 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6201 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6202 &mddev->recovery))
6203 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6204 }
6024 mddev_unlock(mddev); 6205 mddev_unlock(mddev);
6025 } 6206 }
6026} 6207}
@@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this,
6047 6228
6048 for_each_mddev(mddev, tmp) 6229 for_each_mddev(mddev, tmp)
6049 if (mddev_trylock(mddev)) { 6230 if (mddev_trylock(mddev)) {
6050 do_md_stop (mddev, 1); 6231 do_md_stop (mddev, 1, 0);
6051 mddev_unlock(mddev); 6232 mddev_unlock(mddev);
6052 } 6233 }
6053 /* 6234 /*
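[Annotation] do_md_stop() now takes a third argument, is_open, which is compared against the new mddev->openers count; the reboot notifier passes 0 because it holds no file handle on the array (mode 1 switches the array read-only rather than fully tearing it down). A simplified sketch of the added check:

static int do_md_stop(mddev_t *mddev, int mode, int is_open)
{
        if (atomic_read(&mddev->openers) > is_open)
                return -EBUSY;  /* still in use by someone else */
        /* ...switch read-only (mode 1) or fully stop (mode 0)... */
        return 0;
}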