diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/md.c | 38 | ||||
-rw-r--r-- | drivers/md/md.h | 3 | ||||
-rw-r--r-- | drivers/md/raid0.c | 21 | ||||
-rw-r--r-- | drivers/md/raid0.h | 3 | ||||
-rw-r--r-- | drivers/md/raid10.c | 46 | ||||
-rw-r--r-- | drivers/md/raid10.h | 5 | ||||
-rw-r--r-- | drivers/md/raid5.c | 150 |
7 files changed, 187 insertions, 79 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 46b3a044eadf..cb20d0b0555a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) | |||
2087 | /* First make sure individual recovery_offsets are correct */ | 2087 | /* First make sure individual recovery_offsets are correct */ |
2088 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2088 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2089 | if (rdev->raid_disk >= 0 && | 2089 | if (rdev->raid_disk >= 0 && |
2090 | mddev->delta_disks >= 0 && | ||
2090 | !test_bit(In_sync, &rdev->flags) && | 2091 | !test_bit(In_sync, &rdev->flags) && |
2091 | mddev->curr_resync_completed > rdev->recovery_offset) | 2092 | mddev->curr_resync_completed > rdev->recovery_offset) |
2092 | rdev->recovery_offset = mddev->curr_resync_completed; | 2093 | rdev->recovery_offset = mddev->curr_resync_completed; |
@@ -3001,6 +3002,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3001 | return -EINVAL; | 3002 | return -EINVAL; |
3002 | } | 3003 | } |
3003 | 3004 | ||
3005 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
3006 | rdev->new_raid_disk = rdev->raid_disk; | ||
3007 | |||
3004 | /* ->takeover must set new_* and/or delta_disks | 3008 | /* ->takeover must set new_* and/or delta_disks |
3005 | * if it succeeds, and may set them when it fails. | 3009 | * if it succeeds, and may set them when it fails. |
3006 | */ | 3010 | */ |
@@ -3051,13 +3055,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3051 | mddev->safemode = 0; | 3055 | mddev->safemode = 0; |
3052 | } | 3056 | } |
3053 | 3057 | ||
3054 | module_put(mddev->pers->owner); | 3058 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3055 | /* Invalidate devices that are now superfluous */ | 3059 | char nm[20]; |
3056 | list_for_each_entry(rdev, &mddev->disks, same_set) | 3060 | if (rdev->raid_disk < 0) |
3057 | if (rdev->raid_disk >= mddev->raid_disks) { | 3061 | continue; |
3058 | rdev->raid_disk = -1; | 3062 | if (rdev->new_raid_disk > mddev->raid_disks) |
3063 | rdev->new_raid_disk = -1; | ||
3064 | if (rdev->new_raid_disk == rdev->raid_disk) | ||
3065 | continue; | ||
3066 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
3067 | sysfs_remove_link(&mddev->kobj, nm); | ||
3068 | } | ||
3069 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
3070 | if (rdev->raid_disk < 0) | ||
3071 | continue; | ||
3072 | if (rdev->new_raid_disk == rdev->raid_disk) | ||
3073 | continue; | ||
3074 | rdev->raid_disk = rdev->new_raid_disk; | ||
3075 | if (rdev->raid_disk < 0) | ||
3059 | clear_bit(In_sync, &rdev->flags); | 3076 | clear_bit(In_sync, &rdev->flags); |
3077 | else { | ||
3078 | char nm[20]; | ||
3079 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
3080 | if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | ||
3081 | printk("md: cannot register %s for %s after level change\n", | ||
3082 | nm, mdname(mddev)); | ||
3060 | } | 3083 | } |
3084 | } | ||
3085 | |||
3086 | module_put(mddev->pers->owner); | ||
3061 | mddev->pers = pers; | 3087 | mddev->pers = pers; |
3062 | mddev->private = priv; | 3088 | mddev->private = priv; |
3063 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 3089 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
@@ -5895,6 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5895 | atomic_inc(&mddev->openers); | 5921 | atomic_inc(&mddev->openers); |
5896 | mutex_unlock(&mddev->open_mutex); | 5922 | mutex_unlock(&mddev->open_mutex); |
5897 | 5923 | ||
5924 | check_disk_size_change(mddev->gendisk, bdev); | ||
5898 | out: | 5925 | out: |
5899 | return err; | 5926 | return err; |
5900 | } | 5927 | } |
@@ -6846,6 +6873,7 @@ void md_do_sync(mddev_t *mddev) | |||
6846 | rcu_read_lock(); | 6873 | rcu_read_lock(); |
6847 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | 6874 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) |
6848 | if (rdev->raid_disk >= 0 && | 6875 | if (rdev->raid_disk >= 0 && |
6876 | mddev->delta_disks >= 0 && | ||
6849 | !test_bit(Faulty, &rdev->flags) && | 6877 | !test_bit(Faulty, &rdev->flags) && |
6850 | !test_bit(In_sync, &rdev->flags) && | 6878 | !test_bit(In_sync, &rdev->flags) && |
6851 | rdev->recovery_offset < mddev->curr_resync) | 6879 | rdev->recovery_offset < mddev->curr_resync) |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 7ab5ea155452..10597bfec000 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -78,6 +78,9 @@ struct mdk_rdev_s | |||
78 | 78 | ||
79 | int desc_nr; /* descriptor index in the superblock */ | 79 | int desc_nr; /* descriptor index in the superblock */ |
80 | int raid_disk; /* role of device in array */ | 80 | int raid_disk; /* role of device in array */ |
81 | int new_raid_disk; /* role that the device will have in | ||
82 | * the array after a level-change completes. | ||
83 | */ | ||
81 | int saved_raid_disk; /* role that device used to have in the | 84 | int saved_raid_disk; /* role that device used to have in the |
82 | * array and could again if we did a partial | 85 | * array and could again if we did a partial |
83 | * resync from the bitmap | 86 | * resync from the bitmap |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e70f004c99e8..563abed5a2cb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) | |||
173 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 173 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
174 | int j = rdev1->raid_disk; | 174 | int j = rdev1->raid_disk; |
175 | 175 | ||
176 | if (mddev->level == 10) | 176 | if (mddev->level == 10) { |
177 | /* taking over a raid10-n2 array */ | 177 | /* taking over a raid10-n2 array */ |
178 | j /= 2; | 178 | j /= 2; |
179 | rdev1->new_raid_disk = j; | ||
180 | } | ||
179 | 181 | ||
180 | if (j < 0 || j >= mddev->raid_disks) { | 182 | if (j < 0 || j >= mddev->raid_disks) { |
181 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 183 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " |
@@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev) | |||
361 | mddev->private = conf; | 363 | mddev->private = conf; |
362 | } | 364 | } |
363 | conf = mddev->private; | 365 | conf = mddev->private; |
364 | if (conf->scale_raid_disks) { | ||
365 | int i; | ||
366 | for (i=0; i < conf->strip_zone[0].nb_dev; i++) | ||
367 | conf->devlist[i]->raid_disk /= conf->scale_raid_disks; | ||
368 | /* FIXME update sysfs rd links */ | ||
369 | } | ||
370 | 366 | ||
371 | /* calculate array device size */ | 367 | /* calculate array device size */ |
372 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 368 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
@@ -573,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) | |||
573 | return; | 569 | return; |
574 | } | 570 | } |
575 | 571 | ||
576 | static void *raid0_takeover_raid5(mddev_t *mddev) | 572 | static void *raid0_takeover_raid45(mddev_t *mddev) |
577 | { | 573 | { |
578 | mdk_rdev_t *rdev; | 574 | mdk_rdev_t *rdev; |
579 | raid0_conf_t *priv_conf; | 575 | raid0_conf_t *priv_conf; |
@@ -596,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev) | |||
596 | 592 | ||
597 | /* Set new parameters */ | 593 | /* Set new parameters */ |
598 | mddev->new_level = 0; | 594 | mddev->new_level = 0; |
595 | mddev->new_layout = 0; | ||
599 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 596 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
600 | mddev->raid_disks--; | 597 | mddev->raid_disks--; |
601 | mddev->delta_disks = -1; | 598 | mddev->delta_disks = -1; |
@@ -635,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev) | |||
635 | 632 | ||
636 | /* Set new parameters */ | 633 | /* Set new parameters */ |
637 | mddev->new_level = 0; | 634 | mddev->new_level = 0; |
635 | mddev->new_layout = 0; | ||
638 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 636 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
639 | mddev->delta_disks = - mddev->raid_disks / 2; | 637 | mddev->delta_disks = - mddev->raid_disks / 2; |
640 | mddev->raid_disks += mddev->delta_disks; | 638 | mddev->raid_disks += mddev->delta_disks; |
@@ -643,19 +641,22 @@ static void *raid0_takeover_raid10(mddev_t *mddev) | |||
643 | mddev->recovery_cp = MaxSector; | 641 | mddev->recovery_cp = MaxSector; |
644 | 642 | ||
645 | create_strip_zones(mddev, &priv_conf); | 643 | create_strip_zones(mddev, &priv_conf); |
646 | priv_conf->scale_raid_disks = 2; | ||
647 | return priv_conf; | 644 | return priv_conf; |
648 | } | 645 | } |
649 | 646 | ||
650 | static void *raid0_takeover(mddev_t *mddev) | 647 | static void *raid0_takeover(mddev_t *mddev) |
651 | { | 648 | { |
652 | /* raid0 can take over: | 649 | /* raid0 can take over: |
650 | * raid4 - if all data disks are active. | ||
653 | * raid5 - providing it is Raid4 layout and one disk is faulty | 651 | * raid5 - providing it is Raid4 layout and one disk is faulty |
654 | * raid10 - assuming we have all necessary active disks | 652 | * raid10 - assuming we have all necessary active disks |
655 | */ | 653 | */ |
654 | if (mddev->level == 4) | ||
655 | return raid0_takeover_raid45(mddev); | ||
656 | |||
656 | if (mddev->level == 5) { | 657 | if (mddev->level == 5) { |
657 | if (mddev->layout == ALGORITHM_PARITY_N) | 658 | if (mddev->layout == ALGORITHM_PARITY_N) |
658 | return raid0_takeover_raid5(mddev); | 659 | return raid0_takeover_raid45(mddev); |
659 | 660 | ||
660 | printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", | 661 | printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", |
661 | mdname(mddev), ALGORITHM_PARITY_N); | 662 | mdname(mddev), ALGORITHM_PARITY_N); |
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index d724e664ca4d..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h | |||
@@ -13,9 +13,6 @@ struct raid0_private_data | |||
13 | struct strip_zone *strip_zone; | 13 | struct strip_zone *strip_zone; |
14 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ | 14 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ |
15 | int nr_strip_zones; | 15 | int nr_strip_zones; |
16 | int scale_raid_disks; /* divide rdev->raid_disks by this in run() | ||
17 | * to handle conversion from raid10 | ||
18 | */ | ||
19 | }; | 16 | }; |
20 | 17 | ||
21 | typedef struct raid0_private_data raid0_conf_t; | 18 | typedef struct raid0_private_data raid0_conf_t; |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 03724992cdf2..42e64e4e5e25 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1482 | int sectors = r10_bio->sectors; | 1482 | int sectors = r10_bio->sectors; |
1483 | mdk_rdev_t*rdev; | 1483 | mdk_rdev_t*rdev; |
1484 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); | 1484 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); |
1485 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | ||
1485 | 1486 | ||
1486 | rcu_read_lock(); | 1487 | rcu_read_lock(); |
1487 | { | 1488 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1488 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | 1489 | if (rdev) { /* If rdev is not NULL */ |
1489 | char b[BDEVNAME_SIZE]; | 1490 | char b[BDEVNAME_SIZE]; |
1490 | int cur_read_error_count = 0; | 1491 | int cur_read_error_count = 0; |
1491 | 1492 | ||
1492 | rdev = rcu_dereference(conf->mirrors[d].rdev); | ||
1493 | bdevname(rdev->bdev, b); | 1493 | bdevname(rdev->bdev, b); |
1494 | 1494 | ||
1495 | if (test_bit(Faulty, &rdev->flags)) { | 1495 | if (test_bit(Faulty, &rdev->flags)) { |
@@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1530 | 1530 | ||
1531 | rcu_read_lock(); | 1531 | rcu_read_lock(); |
1532 | do { | 1532 | do { |
1533 | int d = r10_bio->devs[sl].devnum; | 1533 | d = r10_bio->devs[sl].devnum; |
1534 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1534 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1535 | if (rdev && | 1535 | if (rdev && |
1536 | test_bit(In_sync, &rdev->flags)) { | 1536 | test_bit(In_sync, &rdev->flags)) { |
@@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1564 | rcu_read_lock(); | 1564 | rcu_read_lock(); |
1565 | while (sl != r10_bio->read_slot) { | 1565 | while (sl != r10_bio->read_slot) { |
1566 | char b[BDEVNAME_SIZE]; | 1566 | char b[BDEVNAME_SIZE]; |
1567 | int d; | 1567 | |
1568 | if (sl==0) | 1568 | if (sl==0) |
1569 | sl = conf->copies; | 1569 | sl = conf->copies; |
1570 | sl--; | 1570 | sl--; |
@@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1601 | } | 1601 | } |
1602 | sl = start; | 1602 | sl = start; |
1603 | while (sl != r10_bio->read_slot) { | 1603 | while (sl != r10_bio->read_slot) { |
1604 | int d; | 1604 | |
1605 | if (sl==0) | 1605 | if (sl==0) |
1606 | sl = conf->copies; | 1606 | sl = conf->copies; |
1607 | sl--; | 1607 | sl--; |
@@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev) | |||
2161 | sector_t stride, size; | 2161 | sector_t stride, size; |
2162 | int err = -EINVAL; | 2162 | int err = -EINVAL; |
2163 | 2163 | ||
2164 | if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || | 2164 | if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || |
2165 | !is_power_of_2(mddev->chunk_sectors)) { | 2165 | !is_power_of_2(mddev->new_chunk_sectors)) { |
2166 | printk(KERN_ERR "md/raid10:%s: chunk size must be " | 2166 | printk(KERN_ERR "md/raid10:%s: chunk size must be " |
2167 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", | 2167 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", |
2168 | mdname(mddev), PAGE_SIZE); | 2168 | mdname(mddev), PAGE_SIZE); |
2169 | goto out; | 2169 | goto out; |
2170 | } | 2170 | } |
2171 | 2171 | ||
2172 | nc = mddev->layout & 255; | 2172 | nc = mddev->new_layout & 255; |
2173 | fc = (mddev->layout >> 8) & 255; | 2173 | fc = (mddev->new_layout >> 8) & 255; |
2174 | fo = mddev->layout & (1<<16); | 2174 | fo = mddev->new_layout & (1<<16); |
2175 | 2175 | ||
2176 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | 2176 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || |
2177 | (mddev->layout >> 17)) { | 2177 | (mddev->new_layout >> 17)) { |
2178 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", | 2178 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
2179 | mdname(mddev), mddev->layout); | 2179 | mdname(mddev), mddev->new_layout); |
2180 | goto out; | 2180 | goto out; |
2181 | } | 2181 | } |
2182 | 2182 | ||
@@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev) | |||
2241 | if (!conf->thread) | 2241 | if (!conf->thread) |
2242 | goto out; | 2242 | goto out; |
2243 | 2243 | ||
2244 | conf->scale_disks = 0; | ||
2245 | conf->mddev = mddev; | 2244 | conf->mddev = mddev; |
2246 | return conf; | 2245 | return conf; |
2247 | 2246 | ||
@@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev) | |||
2300 | if (disk_idx >= conf->raid_disks | 2299 | if (disk_idx >= conf->raid_disks |
2301 | || disk_idx < 0) | 2300 | || disk_idx < 0) |
2302 | continue; | 2301 | continue; |
2303 | if (conf->scale_disks) { | ||
2304 | disk_idx *= conf->scale_disks; | ||
2305 | rdev->raid_disk = disk_idx; | ||
2306 | /* MOVE 'rd%d' link !! */ | ||
2307 | } | ||
2308 | disk = conf->mirrors + disk_idx; | 2302 | disk = conf->mirrors + disk_idx; |
2309 | 2303 | ||
2310 | disk->rdev = rdev; | 2304 | disk->rdev = rdev; |
@@ -2435,26 +2429,22 @@ static void *raid10_takeover_raid0(mddev_t *mddev) | |||
2435 | return ERR_PTR(-EINVAL); | 2429 | return ERR_PTR(-EINVAL); |
2436 | } | 2430 | } |
2437 | 2431 | ||
2438 | /* Update slot numbers to obtain | ||
2439 | * degraded raid10 with missing mirrors | ||
2440 | */ | ||
2441 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2442 | rdev->raid_disk *= 2; | ||
2443 | } | ||
2444 | |||
2445 | /* Set new parameters */ | 2432 | /* Set new parameters */ |
2446 | mddev->new_level = 10; | 2433 | mddev->new_level = 10; |
2447 | /* new layout: far_copies = 1, near_copies = 2 */ | 2434 | /* new layout: far_copies = 1, near_copies = 2 */ |
2448 | mddev->new_layout = (1<<8) + 2; | 2435 | mddev->new_layout = (1<<8) + 2; |
2449 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 2436 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
2450 | mddev->delta_disks = mddev->raid_disks; | 2437 | mddev->delta_disks = mddev->raid_disks; |
2451 | mddev->degraded = mddev->raid_disks; | ||
2452 | mddev->raid_disks *= 2; | 2438 | mddev->raid_disks *= 2; |
2453 | /* make sure it will be not marked as dirty */ | 2439 | /* make sure it will be not marked as dirty */ |
2454 | mddev->recovery_cp = MaxSector; | 2440 | mddev->recovery_cp = MaxSector; |
2455 | 2441 | ||
2456 | conf = setup_conf(mddev); | 2442 | conf = setup_conf(mddev); |
2457 | conf->scale_disks = 2; | 2443 | if (!IS_ERR(conf)) |
2444 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
2445 | if (rdev->raid_disk >= 0) | ||
2446 | rdev->new_raid_disk = rdev->raid_disk * 2; | ||
2447 | |||
2458 | return conf; | 2448 | return conf; |
2459 | } | 2449 | } |
2460 | 2450 | ||
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3824a087e17c..2316ac2e8e21 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -38,11 +38,6 @@ struct r10_private_data_s { | |||
38 | int chunk_shift; /* shift from chunks to sectors */ | 38 | int chunk_shift; /* shift from chunks to sectors */ |
39 | sector_t chunk_mask; | 39 | sector_t chunk_mask; |
40 | 40 | ||
41 | int scale_disks; /* When starting array, multiply | ||
42 | * each ->raid_disk by this. | ||
43 | * Need for raid0->raid10 migration | ||
44 | */ | ||
45 | |||
46 | struct list_head retry_list; | 41 | struct list_head retry_list; |
47 | /* queue pending writes and submit them on unplug */ | 42 | /* queue pending writes and submit them on unplug */ |
48 | struct bio_list pending_bio_list; | 43 | struct bio_list pending_bio_list; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d2c0f94fa37d..96c690279fc6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -277,12 +277,13 @@ out: | |||
277 | return sh; | 277 | return sh; |
278 | } | 278 | } |
279 | 279 | ||
280 | static void shrink_buffers(struct stripe_head *sh, int num) | 280 | static void shrink_buffers(struct stripe_head *sh) |
281 | { | 281 | { |
282 | struct page *p; | 282 | struct page *p; |
283 | int i; | 283 | int i; |
284 | int num = sh->raid_conf->pool_size; | ||
284 | 285 | ||
285 | for (i=0; i<num ; i++) { | 286 | for (i = 0; i < num ; i++) { |
286 | p = sh->dev[i].page; | 287 | p = sh->dev[i].page; |
287 | if (!p) | 288 | if (!p) |
288 | continue; | 289 | continue; |
@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) | |||
291 | } | 292 | } |
292 | } | 293 | } |
293 | 294 | ||
294 | static int grow_buffers(struct stripe_head *sh, int num) | 295 | static int grow_buffers(struct stripe_head *sh) |
295 | { | 296 | { |
296 | int i; | 297 | int i; |
298 | int num = sh->raid_conf->pool_size; | ||
297 | 299 | ||
298 | for (i=0; i<num; i++) { | 300 | for (i = 0; i < num; i++) { |
299 | struct page *page; | 301 | struct page *page; |
300 | 302 | ||
301 | if (!(page = alloc_page(GFP_KERNEL))) { | 303 | if (!(page = alloc_page(GFP_KERNEL))) { |
@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, | |||
364 | return NULL; | 366 | return NULL; |
365 | } | 367 | } |
366 | 368 | ||
369 | /* | ||
370 | * Need to check if array has failed when deciding whether to: | ||
371 | * - start an array | ||
372 | * - remove non-faulty devices | ||
373 | * - add a spare | ||
374 | * - allow a reshape | ||
375 | * This determination is simple when no reshape is happening. | ||
376 | * However if there is a reshape, we need to carefully check | ||
377 | * both the before and after sections. | ||
378 | * This is because some failed devices may only affect one | ||
379 | * of the two sections, and some non-in_sync devices may | ||
380 | * be insync in the section most affected by failed devices. | ||
381 | */ | ||
382 | static int has_failed(raid5_conf_t *conf) | ||
383 | { | ||
384 | int degraded; | ||
385 | int i; | ||
386 | if (conf->mddev->reshape_position == MaxSector) | ||
387 | return conf->mddev->degraded > conf->max_degraded; | ||
388 | |||
389 | rcu_read_lock(); | ||
390 | degraded = 0; | ||
391 | for (i = 0; i < conf->previous_raid_disks; i++) { | ||
392 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
393 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
394 | degraded++; | ||
395 | else if (test_bit(In_sync, &rdev->flags)) | ||
396 | ; | ||
397 | else | ||
398 | /* not in-sync or faulty. | ||
399 | * If the reshape increases the number of devices, | ||
400 | * this is being recovered by the reshape, so | ||
401 | * this 'previous' section is not in_sync. | ||
402 | * If the number of devices is being reduced however, | ||
403 | * the device can only be part of the array if | ||
404 | * we are reverting a reshape, so this section will | ||
405 | * be in-sync. | ||
406 | */ | ||
407 | if (conf->raid_disks >= conf->previous_raid_disks) | ||
408 | degraded++; | ||
409 | } | ||
410 | rcu_read_unlock(); | ||
411 | if (degraded > conf->max_degraded) | ||
412 | return 1; | ||
413 | rcu_read_lock(); | ||
414 | degraded = 0; | ||
415 | for (i = 0; i < conf->raid_disks; i++) { | ||
416 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
417 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
418 | degraded++; | ||
419 | else if (test_bit(In_sync, &rdev->flags)) | ||
420 | ; | ||
421 | else | ||
422 | /* not in-sync or faulty. | ||
423 | * If reshape increases the number of devices, this | ||
424 | * section has already been recovered, else it | ||
425 | * almost certainly hasn't. | ||
426 | */ | ||
427 | if (conf->raid_disks <= conf->previous_raid_disks) | ||
428 | degraded++; | ||
429 | } | ||
430 | rcu_read_unlock(); | ||
431 | if (degraded > conf->max_degraded) | ||
432 | return 1; | ||
433 | return 0; | ||
434 | } | ||
435 | |||
367 | static void unplug_slaves(mddev_t *mddev); | 436 | static void unplug_slaves(mddev_t *mddev); |
368 | static void raid5_unplug_device(struct request_queue *q); | 437 | static void raid5_unplug_device(struct request_queue *q); |
369 | 438 | ||
@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1240 | static int grow_one_stripe(raid5_conf_t *conf) | 1309 | static int grow_one_stripe(raid5_conf_t *conf) |
1241 | { | 1310 | { |
1242 | struct stripe_head *sh; | 1311 | struct stripe_head *sh; |
1243 | int disks = max(conf->raid_disks, conf->previous_raid_disks); | ||
1244 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1312 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); |
1245 | if (!sh) | 1313 | if (!sh) |
1246 | return 0; | 1314 | return 0; |
1247 | memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); | 1315 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); |
1248 | sh->raid_conf = conf; | 1316 | sh->raid_conf = conf; |
1249 | spin_lock_init(&sh->lock); | 1317 | spin_lock_init(&sh->lock); |
1250 | #ifdef CONFIG_MULTICORE_RAID456 | 1318 | #ifdef CONFIG_MULTICORE_RAID456 |
1251 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1319 | init_waitqueue_head(&sh->ops.wait_for_ops); |
1252 | #endif | 1320 | #endif |
1253 | 1321 | ||
1254 | if (grow_buffers(sh, disks)) { | 1322 | if (grow_buffers(sh)) { |
1255 | shrink_buffers(sh, disks); | 1323 | shrink_buffers(sh); |
1256 | kmem_cache_free(conf->slab_cache, sh); | 1324 | kmem_cache_free(conf->slab_cache, sh); |
1257 | return 0; | 1325 | return 0; |
1258 | } | 1326 | } |
@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf) | |||
1468 | if (!sh) | 1536 | if (!sh) |
1469 | return 0; | 1537 | return 0; |
1470 | BUG_ON(atomic_read(&sh->count)); | 1538 | BUG_ON(atomic_read(&sh->count)); |
1471 | shrink_buffers(sh, conf->pool_size); | 1539 | shrink_buffers(sh); |
1472 | kmem_cache_free(conf->slab_cache, sh); | 1540 | kmem_cache_free(conf->slab_cache, sh); |
1473 | atomic_dec(&conf->active_stripes); | 1541 | atomic_dec(&conf->active_stripes); |
1474 | return 1; | 1542 | return 1; |
@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2963 | mdk_rdev_t *rdev; | 3031 | mdk_rdev_t *rdev; |
2964 | 3032 | ||
2965 | dev = &sh->dev[i]; | 3033 | dev = &sh->dev[i]; |
2966 | clear_bit(R5_Insync, &dev->flags); | ||
2967 | 3034 | ||
2968 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3035 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " |
2969 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3036 | "written %p\n", i, dev->flags, dev->toread, dev->read, |
@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3000 | blocked_rdev = rdev; | 3067 | blocked_rdev = rdev; |
3001 | atomic_inc(&rdev->nr_pending); | 3068 | atomic_inc(&rdev->nr_pending); |
3002 | } | 3069 | } |
3003 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3070 | clear_bit(R5_Insync, &dev->flags); |
3071 | if (!rdev) | ||
3072 | /* Not in-sync */; | ||
3073 | else if (test_bit(In_sync, &rdev->flags)) | ||
3074 | set_bit(R5_Insync, &dev->flags); | ||
3075 | else { | ||
3076 | /* could be in-sync depending on recovery/reshape status */ | ||
3077 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
3078 | set_bit(R5_Insync, &dev->flags); | ||
3079 | } | ||
3080 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3004 | /* The ReadError flag will just be confusing now */ | 3081 | /* The ReadError flag will just be confusing now */ |
3005 | clear_bit(R5_ReadError, &dev->flags); | 3082 | clear_bit(R5_ReadError, &dev->flags); |
3006 | clear_bit(R5_ReWrite, &dev->flags); | 3083 | clear_bit(R5_ReWrite, &dev->flags); |
3007 | } | 3084 | } |
3008 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3085 | if (test_bit(R5_ReadError, &dev->flags)) |
3009 | || test_bit(R5_ReadError, &dev->flags)) { | 3086 | clear_bit(R5_Insync, &dev->flags); |
3087 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3010 | s.failed++; | 3088 | s.failed++; |
3011 | s.failed_num = i; | 3089 | s.failed_num = i; |
3012 | } else | 3090 | } |
3013 | set_bit(R5_Insync, &dev->flags); | ||
3014 | } | 3091 | } |
3015 | rcu_read_unlock(); | 3092 | rcu_read_unlock(); |
3016 | 3093 | ||
@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3244 | for (i=disks; i--; ) { | 3321 | for (i=disks; i--; ) { |
3245 | mdk_rdev_t *rdev; | 3322 | mdk_rdev_t *rdev; |
3246 | dev = &sh->dev[i]; | 3323 | dev = &sh->dev[i]; |
3247 | clear_bit(R5_Insync, &dev->flags); | ||
3248 | 3324 | ||
3249 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3325 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3250 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3326 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3282 | blocked_rdev = rdev; | 3358 | blocked_rdev = rdev; |
3283 | atomic_inc(&rdev->nr_pending); | 3359 | atomic_inc(&rdev->nr_pending); |
3284 | } | 3360 | } |
3285 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3361 | clear_bit(R5_Insync, &dev->flags); |
3362 | if (!rdev) | ||
3363 | /* Not in-sync */; | ||
3364 | else if (test_bit(In_sync, &rdev->flags)) | ||
3365 | set_bit(R5_Insync, &dev->flags); | ||
3366 | else { | ||
3367 | /* in sync if before recovery_offset */ | ||
3368 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
3369 | set_bit(R5_Insync, &dev->flags); | ||
3370 | } | ||
3371 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3286 | /* The ReadError flag will just be confusing now */ | 3372 | /* The ReadError flag will just be confusing now */ |
3287 | clear_bit(R5_ReadError, &dev->flags); | 3373 | clear_bit(R5_ReadError, &dev->flags); |
3288 | clear_bit(R5_ReWrite, &dev->flags); | 3374 | clear_bit(R5_ReWrite, &dev->flags); |
3289 | } | 3375 | } |
3290 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3376 | if (test_bit(R5_ReadError, &dev->flags)) |
3291 | || test_bit(R5_ReadError, &dev->flags)) { | 3377 | clear_bit(R5_Insync, &dev->flags); |
3378 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3292 | if (s.failed < 2) | 3379 | if (s.failed < 2) |
3293 | r6s.failed_num[s.failed] = i; | 3380 | r6s.failed_num[s.failed] = i; |
3294 | s.failed++; | 3381 | s.failed++; |
3295 | } else | 3382 | } |
3296 | set_bit(R5_Insync, &dev->flags); | ||
3297 | } | 3383 | } |
3298 | rcu_read_unlock(); | 3384 | rcu_read_unlock(); |
3299 | 3385 | ||
@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev) | |||
4971 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5057 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
4972 | if (rdev->raid_disk < 0) | 5058 | if (rdev->raid_disk < 0) |
4973 | continue; | 5059 | continue; |
4974 | if (test_bit(In_sync, &rdev->flags)) | 5060 | if (test_bit(In_sync, &rdev->flags)) { |
4975 | working_disks++; | 5061 | working_disks++; |
5062 | continue; | ||
5063 | } | ||
4976 | /* This disc is not fully in-sync. However if it | 5064 | /* This disc is not fully in-sync. However if it |
4977 | * just stored parity (beyond the recovery_offset), | 5065 | * just stored parity (beyond the recovery_offset), |
4978 | * when we don't need to be concerned about the | 5066 | * when we don't need to be concerned about the |
@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev) | |||
5005 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) | 5093 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) |
5006 | - working_disks); | 5094 | - working_disks); |
5007 | 5095 | ||
5008 | if (mddev->degraded > conf->max_degraded) { | 5096 | if (has_failed(conf)) { |
5009 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 5097 | printk(KERN_ERR "md/raid:%s: not enough operational devices" |
5010 | " (%d/%d failed)\n", | 5098 | " (%d/%d failed)\n", |
5011 | mdname(mddev), mddev->degraded, conf->raid_disks); | 5099 | mdname(mddev), mddev->degraded, conf->raid_disks); |
@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev) | |||
5207 | for (i = 0; i < conf->raid_disks; i++) { | 5295 | for (i = 0; i < conf->raid_disks; i++) { |
5208 | tmp = conf->disks + i; | 5296 | tmp = conf->disks + i; |
5209 | if (tmp->rdev | 5297 | if (tmp->rdev |
5298 | && tmp->rdev->recovery_offset == MaxSector | ||
5210 | && !test_bit(Faulty, &tmp->rdev->flags) | 5299 | && !test_bit(Faulty, &tmp->rdev->flags) |
5211 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5300 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
5212 | unsigned long flags; | 5301 | unsigned long flags; |
@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
5242 | * isn't possible. | 5331 | * isn't possible. |
5243 | */ | 5332 | */ |
5244 | if (!test_bit(Faulty, &rdev->flags) && | 5333 | if (!test_bit(Faulty, &rdev->flags) && |
5245 | mddev->degraded <= conf->max_degraded && | 5334 | !has_failed(conf) && |
5246 | number < conf->raid_disks) { | 5335 | number < conf->raid_disks) { |
5247 | err = -EBUSY; | 5336 | err = -EBUSY; |
5248 | goto abort; | 5337 | goto abort; |
@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
5270 | int first = 0; | 5359 | int first = 0; |
5271 | int last = conf->raid_disks - 1; | 5360 | int last = conf->raid_disks - 1; |
5272 | 5361 | ||
5273 | if (mddev->degraded > conf->max_degraded) | 5362 | if (has_failed(conf)) |
5274 | /* no point adding a device */ | 5363 | /* no point adding a device */ |
5275 | return -EINVAL; | 5364 | return -EINVAL; |
5276 | 5365 | ||
@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev) | |||
5362 | if (mddev->bitmap) | 5451 | if (mddev->bitmap) |
5363 | /* Cannot grow a bitmap yet */ | 5452 | /* Cannot grow a bitmap yet */ |
5364 | return -EBUSY; | 5453 | return -EBUSY; |
5365 | if (mddev->degraded > conf->max_degraded) | 5454 | if (has_failed(conf)) |
5366 | return -EINVAL; | 5455 | return -EINVAL; |
5367 | if (mddev->delta_disks < 0) { | 5456 | if (mddev->delta_disks < 0) { |
5368 | /* We might be able to shrink, but the devices must | 5457 | /* We might be able to shrink, but the devices must |
@@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5437 | 5526 | ||
5438 | /* Add some new drives, as many as will fit. | 5527 | /* Add some new drives, as many as will fit. |
5439 | * We know there are enough to make the newly sized array work. | 5528 | * We know there are enough to make the newly sized array work. |
5529 | * Don't add devices if we are reducing the number of | ||
5530 | * devices in the array. This is because it is not possible | ||
5531 | * to correctly record the "partially reconstructed" state of | ||
5532 | * such devices during the reshape and confusion could result. | ||
5440 | */ | 5533 | */ |
5441 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5534 | if (mddev->delta_disks >= 0) |
5535 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
5442 | if (rdev->raid_disk < 0 && | 5536 | if (rdev->raid_disk < 0 && |
5443 | !test_bit(Faulty, &rdev->flags)) { | 5537 | !test_bit(Faulty, &rdev->flags)) { |
5444 | if (raid5_add_disk(mddev, rdev) == 0) { | 5538 | if (raid5_add_disk(mddev, rdev) == 0) { |
@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5460 | } | 5554 | } |
5461 | 5555 | ||
5462 | /* When a reshape changes the number of devices, ->degraded | 5556 | /* When a reshape changes the number of devices, ->degraded |
5463 | * is measured against the large of the pre and post number of | 5557 | * is measured against the larger of the pre and post number of |
5464 | * devices.*/ | 5558 | * devices.*/ |
5465 | if (mddev->delta_disks > 0) { | 5559 | if (mddev->delta_disks > 0) { |
5466 | spin_lock_irqsave(&conf->device_lock, flags); | 5560 | spin_lock_irqsave(&conf->device_lock, flags); |