diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-06-28 01:56:32 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-06-28 01:56:32 -0400 |
| commit | 93416253073511716f7e70c06e32c3810c3deac4 (patch) | |
| tree | 7e6a4c7dab40596f6b622f0eaa4b3366ed671b79 | |
| parent | b4322e7057ca851b0a3e15f29e26806efeada100 (diff) | |
| parent | 3424bf6a772cff606fc4bc24a3639c937afb547f (diff) | |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md:
md/raid5: don't include 'spare' drives when reshaping to fewer devices.
md/raid5: add a missing 'continue' in a loop.
md/raid5: Allow recovered part of partially recovered devices to be in-sync
md/raid5: More careful check for "has array failed".
md: Don't update ->recovery_offset when reshaping an array to fewer devices.
md/raid5: avoid oops when number of devices is reduced then increased.
md: enable raid4->raid0 takeover
md: clear layout after ->raid0 takeover
md: fix raid10 takeover: use new_layout for setup_conf
md: fix handling of array level takeover that re-arranges devices.
md: raid10: Fix null pointer dereference in fix_read_error()
Restore partition detection of newly created md arrays.
| -rw-r--r-- | drivers/md/md.c | 38 | ||||
| -rw-r--r-- | drivers/md/md.h | 3 | ||||
| -rw-r--r-- | drivers/md/raid0.c | 21 | ||||
| -rw-r--r-- | drivers/md/raid0.h | 3 | ||||
| -rw-r--r-- | drivers/md/raid10.c | 46 | ||||
| -rw-r--r-- | drivers/md/raid10.h | 5 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 150 |
7 files changed, 187 insertions, 79 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 46b3a044eadf..cb20d0b0555a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) | |||
| 2087 | /* First make sure individual recovery_offsets are correct */ | 2087 | /* First make sure individual recovery_offsets are correct */ |
| 2088 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2088 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 2089 | if (rdev->raid_disk >= 0 && | 2089 | if (rdev->raid_disk >= 0 && |
| 2090 | mddev->delta_disks >= 0 && | ||
| 2090 | !test_bit(In_sync, &rdev->flags) && | 2091 | !test_bit(In_sync, &rdev->flags) && |
| 2091 | mddev->curr_resync_completed > rdev->recovery_offset) | 2092 | mddev->curr_resync_completed > rdev->recovery_offset) |
| 2092 | rdev->recovery_offset = mddev->curr_resync_completed; | 2093 | rdev->recovery_offset = mddev->curr_resync_completed; |
| @@ -3001,6 +3002,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3001 | return -EINVAL; | 3002 | return -EINVAL; |
| 3002 | } | 3003 | } |
| 3003 | 3004 | ||
| 3005 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
| 3006 | rdev->new_raid_disk = rdev->raid_disk; | ||
| 3007 | |||
| 3004 | /* ->takeover must set new_* and/or delta_disks | 3008 | /* ->takeover must set new_* and/or delta_disks |
| 3005 | * if it succeeds, and may set them when it fails. | 3009 | * if it succeeds, and may set them when it fails. |
| 3006 | */ | 3010 | */ |
| @@ -3051,13 +3055,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3051 | mddev->safemode = 0; | 3055 | mddev->safemode = 0; |
| 3052 | } | 3056 | } |
| 3053 | 3057 | ||
| 3054 | module_put(mddev->pers->owner); | 3058 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 3055 | /* Invalidate devices that are now superfluous */ | 3059 | char nm[20]; |
| 3056 | list_for_each_entry(rdev, &mddev->disks, same_set) | 3060 | if (rdev->raid_disk < 0) |
| 3057 | if (rdev->raid_disk >= mddev->raid_disks) { | 3061 | continue; |
| 3058 | rdev->raid_disk = -1; | 3062 | if (rdev->new_raid_disk > mddev->raid_disks) |
| 3063 | rdev->new_raid_disk = -1; | ||
| 3064 | if (rdev->new_raid_disk == rdev->raid_disk) | ||
| 3065 | continue; | ||
| 3066 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 3067 | sysfs_remove_link(&mddev->kobj, nm); | ||
| 3068 | } | ||
| 3069 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
| 3070 | if (rdev->raid_disk < 0) | ||
| 3071 | continue; | ||
| 3072 | if (rdev->new_raid_disk == rdev->raid_disk) | ||
| 3073 | continue; | ||
| 3074 | rdev->raid_disk = rdev->new_raid_disk; | ||
| 3075 | if (rdev->raid_disk < 0) | ||
| 3059 | clear_bit(In_sync, &rdev->flags); | 3076 | clear_bit(In_sync, &rdev->flags); |
| 3077 | else { | ||
| 3078 | char nm[20]; | ||
| 3079 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 3080 | if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | ||
| 3081 | printk("md: cannot register %s for %s after level change\n", | ||
| 3082 | nm, mdname(mddev)); | ||
| 3060 | } | 3083 | } |
| 3084 | } | ||
| 3085 | |||
| 3086 | module_put(mddev->pers->owner); | ||
| 3061 | mddev->pers = pers; | 3087 | mddev->pers = pers; |
| 3062 | mddev->private = priv; | 3088 | mddev->private = priv; |
| 3063 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 3089 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
| @@ -5895,6 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
| 5895 | atomic_inc(&mddev->openers); | 5921 | atomic_inc(&mddev->openers); |
| 5896 | mutex_unlock(&mddev->open_mutex); | 5922 | mutex_unlock(&mddev->open_mutex); |
| 5897 | 5923 | ||
| 5924 | check_disk_size_change(mddev->gendisk, bdev); | ||
| 5898 | out: | 5925 | out: |
| 5899 | return err; | 5926 | return err; |
| 5900 | } | 5927 | } |
| @@ -6846,6 +6873,7 @@ void md_do_sync(mddev_t *mddev) | |||
| 6846 | rcu_read_lock(); | 6873 | rcu_read_lock(); |
| 6847 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | 6874 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) |
| 6848 | if (rdev->raid_disk >= 0 && | 6875 | if (rdev->raid_disk >= 0 && |
| 6876 | mddev->delta_disks >= 0 && | ||
| 6849 | !test_bit(Faulty, &rdev->flags) && | 6877 | !test_bit(Faulty, &rdev->flags) && |
| 6850 | !test_bit(In_sync, &rdev->flags) && | 6878 | !test_bit(In_sync, &rdev->flags) && |
| 6851 | rdev->recovery_offset < mddev->curr_resync) | 6879 | rdev->recovery_offset < mddev->curr_resync) |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 7ab5ea155452..10597bfec000 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
| @@ -78,6 +78,9 @@ struct mdk_rdev_s | |||
| 78 | 78 | ||
| 79 | int desc_nr; /* descriptor index in the superblock */ | 79 | int desc_nr; /* descriptor index in the superblock */ |
| 80 | int raid_disk; /* role of device in array */ | 80 | int raid_disk; /* role of device in array */ |
| 81 | int new_raid_disk; /* role that the device will have in | ||
| 82 | * the array after a level-change completes. | ||
| 83 | */ | ||
| 81 | int saved_raid_disk; /* role that device used to have in the | 84 | int saved_raid_disk; /* role that device used to have in the |
| 82 | * array and could again if we did a partial | 85 | * array and could again if we did a partial |
| 83 | * resync from the bitmap | 86 | * resync from the bitmap |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e70f004c99e8..563abed5a2cb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) | |||
| 173 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 173 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
| 174 | int j = rdev1->raid_disk; | 174 | int j = rdev1->raid_disk; |
| 175 | 175 | ||
| 176 | if (mddev->level == 10) | 176 | if (mddev->level == 10) { |
| 177 | /* taking over a raid10-n2 array */ | 177 | /* taking over a raid10-n2 array */ |
| 178 | j /= 2; | 178 | j /= 2; |
| 179 | rdev1->new_raid_disk = j; | ||
| 180 | } | ||
| 179 | 181 | ||
| 180 | if (j < 0 || j >= mddev->raid_disks) { | 182 | if (j < 0 || j >= mddev->raid_disks) { |
| 181 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 183 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " |
| @@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev) | |||
| 361 | mddev->private = conf; | 363 | mddev->private = conf; |
| 362 | } | 364 | } |
| 363 | conf = mddev->private; | 365 | conf = mddev->private; |
| 364 | if (conf->scale_raid_disks) { | ||
| 365 | int i; | ||
| 366 | for (i=0; i < conf->strip_zone[0].nb_dev; i++) | ||
| 367 | conf->devlist[i]->raid_disk /= conf->scale_raid_disks; | ||
| 368 | /* FIXME update sysfs rd links */ | ||
| 369 | } | ||
| 370 | 366 | ||
| 371 | /* calculate array device size */ | 367 | /* calculate array device size */ |
| 372 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 368 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
| @@ -573,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) | |||
| 573 | return; | 569 | return; |
| 574 | } | 570 | } |
| 575 | 571 | ||
| 576 | static void *raid0_takeover_raid5(mddev_t *mddev) | 572 | static void *raid0_takeover_raid45(mddev_t *mddev) |
| 577 | { | 573 | { |
| 578 | mdk_rdev_t *rdev; | 574 | mdk_rdev_t *rdev; |
| 579 | raid0_conf_t *priv_conf; | 575 | raid0_conf_t *priv_conf; |
| @@ -596,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev) | |||
| 596 | 592 | ||
| 597 | /* Set new parameters */ | 593 | /* Set new parameters */ |
| 598 | mddev->new_level = 0; | 594 | mddev->new_level = 0; |
| 595 | mddev->new_layout = 0; | ||
| 599 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 596 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 600 | mddev->raid_disks--; | 597 | mddev->raid_disks--; |
| 601 | mddev->delta_disks = -1; | 598 | mddev->delta_disks = -1; |
| @@ -635,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev) | |||
| 635 | 632 | ||
| 636 | /* Set new parameters */ | 633 | /* Set new parameters */ |
| 637 | mddev->new_level = 0; | 634 | mddev->new_level = 0; |
| 635 | mddev->new_layout = 0; | ||
| 638 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 636 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 639 | mddev->delta_disks = - mddev->raid_disks / 2; | 637 | mddev->delta_disks = - mddev->raid_disks / 2; |
| 640 | mddev->raid_disks += mddev->delta_disks; | 638 | mddev->raid_disks += mddev->delta_disks; |
| @@ -643,19 +641,22 @@ static void *raid0_takeover_raid10(mddev_t *mddev) | |||
| 643 | mddev->recovery_cp = MaxSector; | 641 | mddev->recovery_cp = MaxSector; |
| 644 | 642 | ||
| 645 | create_strip_zones(mddev, &priv_conf); | 643 | create_strip_zones(mddev, &priv_conf); |
| 646 | priv_conf->scale_raid_disks = 2; | ||
| 647 | return priv_conf; | 644 | return priv_conf; |
| 648 | } | 645 | } |
| 649 | 646 | ||
| 650 | static void *raid0_takeover(mddev_t *mddev) | 647 | static void *raid0_takeover(mddev_t *mddev) |
| 651 | { | 648 | { |
| 652 | /* raid0 can take over: | 649 | /* raid0 can take over: |
| 650 | * raid4 - if all data disks are active. | ||
| 653 | * raid5 - providing it is Raid4 layout and one disk is faulty | 651 | * raid5 - providing it is Raid4 layout and one disk is faulty |
| 654 | * raid10 - assuming we have all necessary active disks | 652 | * raid10 - assuming we have all necessary active disks |
| 655 | */ | 653 | */ |
| 654 | if (mddev->level == 4) | ||
| 655 | return raid0_takeover_raid45(mddev); | ||
| 656 | |||
| 656 | if (mddev->level == 5) { | 657 | if (mddev->level == 5) { |
| 657 | if (mddev->layout == ALGORITHM_PARITY_N) | 658 | if (mddev->layout == ALGORITHM_PARITY_N) |
| 658 | return raid0_takeover_raid5(mddev); | 659 | return raid0_takeover_raid45(mddev); |
| 659 | 660 | ||
| 660 | printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", | 661 | printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", |
| 661 | mdname(mddev), ALGORITHM_PARITY_N); | 662 | mdname(mddev), ALGORITHM_PARITY_N); |
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index d724e664ca4d..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h | |||
| @@ -13,9 +13,6 @@ struct raid0_private_data | |||
| 13 | struct strip_zone *strip_zone; | 13 | struct strip_zone *strip_zone; |
| 14 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ | 14 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ |
| 15 | int nr_strip_zones; | 15 | int nr_strip_zones; |
| 16 | int scale_raid_disks; /* divide rdev->raid_disks by this in run() | ||
| 17 | * to handle conversion from raid10 | ||
| 18 | */ | ||
| 19 | }; | 16 | }; |
| 20 | 17 | ||
| 21 | typedef struct raid0_private_data raid0_conf_t; | 18 | typedef struct raid0_private_data raid0_conf_t; |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 03724992cdf2..42e64e4e5e25 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1482 | int sectors = r10_bio->sectors; | 1482 | int sectors = r10_bio->sectors; |
| 1483 | mdk_rdev_t*rdev; | 1483 | mdk_rdev_t*rdev; |
| 1484 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); | 1484 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); |
| 1485 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | ||
| 1485 | 1486 | ||
| 1486 | rcu_read_lock(); | 1487 | rcu_read_lock(); |
| 1487 | { | 1488 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
| 1488 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | 1489 | if (rdev) { /* If rdev is not NULL */ |
| 1489 | char b[BDEVNAME_SIZE]; | 1490 | char b[BDEVNAME_SIZE]; |
| 1490 | int cur_read_error_count = 0; | 1491 | int cur_read_error_count = 0; |
| 1491 | 1492 | ||
| 1492 | rdev = rcu_dereference(conf->mirrors[d].rdev); | ||
| 1493 | bdevname(rdev->bdev, b); | 1493 | bdevname(rdev->bdev, b); |
| 1494 | 1494 | ||
| 1495 | if (test_bit(Faulty, &rdev->flags)) { | 1495 | if (test_bit(Faulty, &rdev->flags)) { |
| @@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1530 | 1530 | ||
| 1531 | rcu_read_lock(); | 1531 | rcu_read_lock(); |
| 1532 | do { | 1532 | do { |
| 1533 | int d = r10_bio->devs[sl].devnum; | 1533 | d = r10_bio->devs[sl].devnum; |
| 1534 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1534 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
| 1535 | if (rdev && | 1535 | if (rdev && |
| 1536 | test_bit(In_sync, &rdev->flags)) { | 1536 | test_bit(In_sync, &rdev->flags)) { |
| @@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1564 | rcu_read_lock(); | 1564 | rcu_read_lock(); |
| 1565 | while (sl != r10_bio->read_slot) { | 1565 | while (sl != r10_bio->read_slot) { |
| 1566 | char b[BDEVNAME_SIZE]; | 1566 | char b[BDEVNAME_SIZE]; |
| 1567 | int d; | 1567 | |
| 1568 | if (sl==0) | 1568 | if (sl==0) |
| 1569 | sl = conf->copies; | 1569 | sl = conf->copies; |
| 1570 | sl--; | 1570 | sl--; |
| @@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1601 | } | 1601 | } |
| 1602 | sl = start; | 1602 | sl = start; |
| 1603 | while (sl != r10_bio->read_slot) { | 1603 | while (sl != r10_bio->read_slot) { |
| 1604 | int d; | 1604 | |
| 1605 | if (sl==0) | 1605 | if (sl==0) |
| 1606 | sl = conf->copies; | 1606 | sl = conf->copies; |
| 1607 | sl--; | 1607 | sl--; |
| @@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev) | |||
| 2161 | sector_t stride, size; | 2161 | sector_t stride, size; |
| 2162 | int err = -EINVAL; | 2162 | int err = -EINVAL; |
| 2163 | 2163 | ||
| 2164 | if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || | 2164 | if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || |
| 2165 | !is_power_of_2(mddev->chunk_sectors)) { | 2165 | !is_power_of_2(mddev->new_chunk_sectors)) { |
| 2166 | printk(KERN_ERR "md/raid10:%s: chunk size must be " | 2166 | printk(KERN_ERR "md/raid10:%s: chunk size must be " |
| 2167 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", | 2167 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", |
| 2168 | mdname(mddev), PAGE_SIZE); | 2168 | mdname(mddev), PAGE_SIZE); |
| 2169 | goto out; | 2169 | goto out; |
| 2170 | } | 2170 | } |
| 2171 | 2171 | ||
| 2172 | nc = mddev->layout & 255; | 2172 | nc = mddev->new_layout & 255; |
| 2173 | fc = (mddev->layout >> 8) & 255; | 2173 | fc = (mddev->new_layout >> 8) & 255; |
| 2174 | fo = mddev->layout & (1<<16); | 2174 | fo = mddev->new_layout & (1<<16); |
| 2175 | 2175 | ||
| 2176 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || | 2176 | if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || |
| 2177 | (mddev->layout >> 17)) { | 2177 | (mddev->new_layout >> 17)) { |
| 2178 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", | 2178 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
| 2179 | mdname(mddev), mddev->layout); | 2179 | mdname(mddev), mddev->new_layout); |
| 2180 | goto out; | 2180 | goto out; |
| 2181 | } | 2181 | } |
| 2182 | 2182 | ||
| @@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev) | |||
| 2241 | if (!conf->thread) | 2241 | if (!conf->thread) |
| 2242 | goto out; | 2242 | goto out; |
| 2243 | 2243 | ||
| 2244 | conf->scale_disks = 0; | ||
| 2245 | conf->mddev = mddev; | 2244 | conf->mddev = mddev; |
| 2246 | return conf; | 2245 | return conf; |
| 2247 | 2246 | ||
| @@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev) | |||
| 2300 | if (disk_idx >= conf->raid_disks | 2299 | if (disk_idx >= conf->raid_disks |
| 2301 | || disk_idx < 0) | 2300 | || disk_idx < 0) |
| 2302 | continue; | 2301 | continue; |
| 2303 | if (conf->scale_disks) { | ||
| 2304 | disk_idx *= conf->scale_disks; | ||
| 2305 | rdev->raid_disk = disk_idx; | ||
| 2306 | /* MOVE 'rd%d' link !! */ | ||
| 2307 | } | ||
| 2308 | disk = conf->mirrors + disk_idx; | 2302 | disk = conf->mirrors + disk_idx; |
| 2309 | 2303 | ||
| 2310 | disk->rdev = rdev; | 2304 | disk->rdev = rdev; |
| @@ -2435,26 +2429,22 @@ static void *raid10_takeover_raid0(mddev_t *mddev) | |||
| 2435 | return ERR_PTR(-EINVAL); | 2429 | return ERR_PTR(-EINVAL); |
| 2436 | } | 2430 | } |
| 2437 | 2431 | ||
| 2438 | /* Update slot numbers to obtain | ||
| 2439 | * degraded raid10 with missing mirrors | ||
| 2440 | */ | ||
| 2441 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
| 2442 | rdev->raid_disk *= 2; | ||
| 2443 | } | ||
| 2444 | |||
| 2445 | /* Set new parameters */ | 2432 | /* Set new parameters */ |
| 2446 | mddev->new_level = 10; | 2433 | mddev->new_level = 10; |
| 2447 | /* new layout: far_copies = 1, near_copies = 2 */ | 2434 | /* new layout: far_copies = 1, near_copies = 2 */ |
| 2448 | mddev->new_layout = (1<<8) + 2; | 2435 | mddev->new_layout = (1<<8) + 2; |
| 2449 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 2436 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 2450 | mddev->delta_disks = mddev->raid_disks; | 2437 | mddev->delta_disks = mddev->raid_disks; |
| 2451 | mddev->degraded = mddev->raid_disks; | ||
| 2452 | mddev->raid_disks *= 2; | 2438 | mddev->raid_disks *= 2; |
| 2453 | /* make sure it will be not marked as dirty */ | 2439 | /* make sure it will be not marked as dirty */ |
| 2454 | mddev->recovery_cp = MaxSector; | 2440 | mddev->recovery_cp = MaxSector; |
| 2455 | 2441 | ||
| 2456 | conf = setup_conf(mddev); | 2442 | conf = setup_conf(mddev); |
| 2457 | conf->scale_disks = 2; | 2443 | if (!IS_ERR(conf)) |
| 2444 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
| 2445 | if (rdev->raid_disk >= 0) | ||
| 2446 | rdev->new_raid_disk = rdev->raid_disk * 2; | ||
| 2447 | |||
| 2458 | return conf; | 2448 | return conf; |
| 2459 | } | 2449 | } |
| 2460 | 2450 | ||
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3824a087e17c..2316ac2e8e21 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
| @@ -38,11 +38,6 @@ struct r10_private_data_s { | |||
| 38 | int chunk_shift; /* shift from chunks to sectors */ | 38 | int chunk_shift; /* shift from chunks to sectors */ |
| 39 | sector_t chunk_mask; | 39 | sector_t chunk_mask; |
| 40 | 40 | ||
| 41 | int scale_disks; /* When starting array, multiply | ||
| 42 | * each ->raid_disk by this. | ||
| 43 | * Need for raid0->raid10 migration | ||
| 44 | */ | ||
| 45 | |||
| 46 | struct list_head retry_list; | 41 | struct list_head retry_list; |
| 47 | /* queue pending writes and submit them on unplug */ | 42 | /* queue pending writes and submit them on unplug */ |
| 48 | struct bio_list pending_bio_list; | 43 | struct bio_list pending_bio_list; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d2c0f94fa37d..96c690279fc6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -277,12 +277,13 @@ out: | |||
| 277 | return sh; | 277 | return sh; |
| 278 | } | 278 | } |
| 279 | 279 | ||
| 280 | static void shrink_buffers(struct stripe_head *sh, int num) | 280 | static void shrink_buffers(struct stripe_head *sh) |
| 281 | { | 281 | { |
| 282 | struct page *p; | 282 | struct page *p; |
| 283 | int i; | 283 | int i; |
| 284 | int num = sh->raid_conf->pool_size; | ||
| 284 | 285 | ||
| 285 | for (i=0; i<num ; i++) { | 286 | for (i = 0; i < num ; i++) { |
| 286 | p = sh->dev[i].page; | 287 | p = sh->dev[i].page; |
| 287 | if (!p) | 288 | if (!p) |
| 288 | continue; | 289 | continue; |
| @@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) | |||
| 291 | } | 292 | } |
| 292 | } | 293 | } |
| 293 | 294 | ||
| 294 | static int grow_buffers(struct stripe_head *sh, int num) | 295 | static int grow_buffers(struct stripe_head *sh) |
| 295 | { | 296 | { |
| 296 | int i; | 297 | int i; |
| 298 | int num = sh->raid_conf->pool_size; | ||
| 297 | 299 | ||
| 298 | for (i=0; i<num; i++) { | 300 | for (i = 0; i < num; i++) { |
| 299 | struct page *page; | 301 | struct page *page; |
| 300 | 302 | ||
| 301 | if (!(page = alloc_page(GFP_KERNEL))) { | 303 | if (!(page = alloc_page(GFP_KERNEL))) { |
| @@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, | |||
| 364 | return NULL; | 366 | return NULL; |
| 365 | } | 367 | } |
| 366 | 368 | ||
| 369 | /* | ||
| 370 | * Need to check if array has failed when deciding whether to: | ||
| 371 | * - start an array | ||
| 372 | * - remove non-faulty devices | ||
| 373 | * - add a spare | ||
| 374 | * - allow a reshape | ||
| 375 | * This determination is simple when no reshape is happening. | ||
| 376 | * However if there is a reshape, we need to carefully check | ||
| 377 | * both the before and after sections. | ||
| 378 | * This is because some failed devices may only affect one | ||
| 379 | * of the two sections, and some non-in_sync devices may | ||
| 380 | * be insync in the section most affected by failed devices. | ||
| 381 | */ | ||
| 382 | static int has_failed(raid5_conf_t *conf) | ||
| 383 | { | ||
| 384 | int degraded; | ||
| 385 | int i; | ||
| 386 | if (conf->mddev->reshape_position == MaxSector) | ||
| 387 | return conf->mddev->degraded > conf->max_degraded; | ||
| 388 | |||
| 389 | rcu_read_lock(); | ||
| 390 | degraded = 0; | ||
| 391 | for (i = 0; i < conf->previous_raid_disks; i++) { | ||
| 392 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
| 393 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
| 394 | degraded++; | ||
| 395 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 396 | ; | ||
| 397 | else | ||
| 398 | /* not in-sync or faulty. | ||
| 399 | * If the reshape increases the number of devices, | ||
| 400 | * this is being recovered by the reshape, so | ||
| 401 | * this 'previous' section is not in_sync. | ||
| 402 | * If the number of devices is being reduced however, | ||
| 403 | * the device can only be part of the array if | ||
| 404 | * we are reverting a reshape, so this section will | ||
| 405 | * be in-sync. | ||
| 406 | */ | ||
| 407 | if (conf->raid_disks >= conf->previous_raid_disks) | ||
| 408 | degraded++; | ||
| 409 | } | ||
| 410 | rcu_read_unlock(); | ||
| 411 | if (degraded > conf->max_degraded) | ||
| 412 | return 1; | ||
| 413 | rcu_read_lock(); | ||
| 414 | degraded = 0; | ||
| 415 | for (i = 0; i < conf->raid_disks; i++) { | ||
| 416 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
| 417 | if (!rdev || test_bit(Faulty, &rdev->flags)) | ||
| 418 | degraded++; | ||
| 419 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 420 | ; | ||
| 421 | else | ||
| 422 | /* not in-sync or faulty. | ||
| 423 | * If reshape increases the number of devices, this | ||
| 424 | * section has already been recovered, else it | ||
| 425 | * almost certainly hasn't. | ||
| 426 | */ | ||
| 427 | if (conf->raid_disks <= conf->previous_raid_disks) | ||
| 428 | degraded++; | ||
| 429 | } | ||
| 430 | rcu_read_unlock(); | ||
| 431 | if (degraded > conf->max_degraded) | ||
| 432 | return 1; | ||
| 433 | return 0; | ||
| 434 | } | ||
| 435 | |||
| 367 | static void unplug_slaves(mddev_t *mddev); | 436 | static void unplug_slaves(mddev_t *mddev); |
| 368 | static void raid5_unplug_device(struct request_queue *q); | 437 | static void raid5_unplug_device(struct request_queue *q); |
| 369 | 438 | ||
| @@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1240 | static int grow_one_stripe(raid5_conf_t *conf) | 1309 | static int grow_one_stripe(raid5_conf_t *conf) |
| 1241 | { | 1310 | { |
| 1242 | struct stripe_head *sh; | 1311 | struct stripe_head *sh; |
| 1243 | int disks = max(conf->raid_disks, conf->previous_raid_disks); | ||
| 1244 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1312 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); |
| 1245 | if (!sh) | 1313 | if (!sh) |
| 1246 | return 0; | 1314 | return 0; |
| 1247 | memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); | 1315 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); |
| 1248 | sh->raid_conf = conf; | 1316 | sh->raid_conf = conf; |
| 1249 | spin_lock_init(&sh->lock); | 1317 | spin_lock_init(&sh->lock); |
| 1250 | #ifdef CONFIG_MULTICORE_RAID456 | 1318 | #ifdef CONFIG_MULTICORE_RAID456 |
| 1251 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1319 | init_waitqueue_head(&sh->ops.wait_for_ops); |
| 1252 | #endif | 1320 | #endif |
| 1253 | 1321 | ||
| 1254 | if (grow_buffers(sh, disks)) { | 1322 | if (grow_buffers(sh)) { |
| 1255 | shrink_buffers(sh, disks); | 1323 | shrink_buffers(sh); |
| 1256 | kmem_cache_free(conf->slab_cache, sh); | 1324 | kmem_cache_free(conf->slab_cache, sh); |
| 1257 | return 0; | 1325 | return 0; |
| 1258 | } | 1326 | } |
| @@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf) | |||
| 1468 | if (!sh) | 1536 | if (!sh) |
| 1469 | return 0; | 1537 | return 0; |
| 1470 | BUG_ON(atomic_read(&sh->count)); | 1538 | BUG_ON(atomic_read(&sh->count)); |
| 1471 | shrink_buffers(sh, conf->pool_size); | 1539 | shrink_buffers(sh); |
| 1472 | kmem_cache_free(conf->slab_cache, sh); | 1540 | kmem_cache_free(conf->slab_cache, sh); |
| 1473 | atomic_dec(&conf->active_stripes); | 1541 | atomic_dec(&conf->active_stripes); |
| 1474 | return 1; | 1542 | return 1; |
| @@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2963 | mdk_rdev_t *rdev; | 3031 | mdk_rdev_t *rdev; |
| 2964 | 3032 | ||
| 2965 | dev = &sh->dev[i]; | 3033 | dev = &sh->dev[i]; |
| 2966 | clear_bit(R5_Insync, &dev->flags); | ||
| 2967 | 3034 | ||
| 2968 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3035 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " |
| 2969 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3036 | "written %p\n", i, dev->flags, dev->toread, dev->read, |
| @@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 3000 | blocked_rdev = rdev; | 3067 | blocked_rdev = rdev; |
| 3001 | atomic_inc(&rdev->nr_pending); | 3068 | atomic_inc(&rdev->nr_pending); |
| 3002 | } | 3069 | } |
| 3003 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3070 | clear_bit(R5_Insync, &dev->flags); |
| 3071 | if (!rdev) | ||
| 3072 | /* Not in-sync */; | ||
| 3073 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 3074 | set_bit(R5_Insync, &dev->flags); | ||
| 3075 | else { | ||
| 3076 | /* could be in-sync depending on recovery/reshape status */ | ||
| 3077 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
| 3078 | set_bit(R5_Insync, &dev->flags); | ||
| 3079 | } | ||
| 3080 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3004 | /* The ReadError flag will just be confusing now */ | 3081 | /* The ReadError flag will just be confusing now */ |
| 3005 | clear_bit(R5_ReadError, &dev->flags); | 3082 | clear_bit(R5_ReadError, &dev->flags); |
| 3006 | clear_bit(R5_ReWrite, &dev->flags); | 3083 | clear_bit(R5_ReWrite, &dev->flags); |
| 3007 | } | 3084 | } |
| 3008 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3085 | if (test_bit(R5_ReadError, &dev->flags)) |
| 3009 | || test_bit(R5_ReadError, &dev->flags)) { | 3086 | clear_bit(R5_Insync, &dev->flags); |
| 3087 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3010 | s.failed++; | 3088 | s.failed++; |
| 3011 | s.failed_num = i; | 3089 | s.failed_num = i; |
| 3012 | } else | 3090 | } |
| 3013 | set_bit(R5_Insync, &dev->flags); | ||
| 3014 | } | 3091 | } |
| 3015 | rcu_read_unlock(); | 3092 | rcu_read_unlock(); |
| 3016 | 3093 | ||
| @@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3244 | for (i=disks; i--; ) { | 3321 | for (i=disks; i--; ) { |
| 3245 | mdk_rdev_t *rdev; | 3322 | mdk_rdev_t *rdev; |
| 3246 | dev = &sh->dev[i]; | 3323 | dev = &sh->dev[i]; |
| 3247 | clear_bit(R5_Insync, &dev->flags); | ||
| 3248 | 3324 | ||
| 3249 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3325 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
| 3250 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3326 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
| @@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh) | |||
| 3282 | blocked_rdev = rdev; | 3358 | blocked_rdev = rdev; |
| 3283 | atomic_inc(&rdev->nr_pending); | 3359 | atomic_inc(&rdev->nr_pending); |
| 3284 | } | 3360 | } |
| 3285 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 3361 | clear_bit(R5_Insync, &dev->flags); |
| 3362 | if (!rdev) | ||
| 3363 | /* Not in-sync */; | ||
| 3364 | else if (test_bit(In_sync, &rdev->flags)) | ||
| 3365 | set_bit(R5_Insync, &dev->flags); | ||
| 3366 | else { | ||
| 3367 | /* in sync if before recovery_offset */ | ||
| 3368 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
| 3369 | set_bit(R5_Insync, &dev->flags); | ||
| 3370 | } | ||
| 3371 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3286 | /* The ReadError flag will just be confusing now */ | 3372 | /* The ReadError flag will just be confusing now */ |
| 3287 | clear_bit(R5_ReadError, &dev->flags); | 3373 | clear_bit(R5_ReadError, &dev->flags); |
| 3288 | clear_bit(R5_ReWrite, &dev->flags); | 3374 | clear_bit(R5_ReWrite, &dev->flags); |
| 3289 | } | 3375 | } |
| 3290 | if (!rdev || !test_bit(In_sync, &rdev->flags) | 3376 | if (test_bit(R5_ReadError, &dev->flags)) |
| 3291 | || test_bit(R5_ReadError, &dev->flags)) { | 3377 | clear_bit(R5_Insync, &dev->flags); |
| 3378 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
| 3292 | if (s.failed < 2) | 3379 | if (s.failed < 2) |
| 3293 | r6s.failed_num[s.failed] = i; | 3380 | r6s.failed_num[s.failed] = i; |
| 3294 | s.failed++; | 3381 | s.failed++; |
| 3295 | } else | 3382 | } |
| 3296 | set_bit(R5_Insync, &dev->flags); | ||
| 3297 | } | 3383 | } |
| 3298 | rcu_read_unlock(); | 3384 | rcu_read_unlock(); |
| 3299 | 3385 | ||
| @@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev) | |||
| 4971 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5057 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 4972 | if (rdev->raid_disk < 0) | 5058 | if (rdev->raid_disk < 0) |
| 4973 | continue; | 5059 | continue; |
| 4974 | if (test_bit(In_sync, &rdev->flags)) | 5060 | if (test_bit(In_sync, &rdev->flags)) { |
| 4975 | working_disks++; | 5061 | working_disks++; |
| 5062 | continue; | ||
| 5063 | } | ||
| 4976 | /* This disc is not fully in-sync. However if it | 5064 | /* This disc is not fully in-sync. However if it |
| 4977 | * just stored parity (beyond the recovery_offset), | 5065 | * just stored parity (beyond the recovery_offset), |
| 4978 | * when we don't need to be concerned about the | 5066 | * when we don't need to be concerned about the |
| @@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev) | |||
| 5005 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) | 5093 | mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) |
| 5006 | - working_disks); | 5094 | - working_disks); |
| 5007 | 5095 | ||
| 5008 | if (mddev->degraded > conf->max_degraded) { | 5096 | if (has_failed(conf)) { |
| 5009 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 5097 | printk(KERN_ERR "md/raid:%s: not enough operational devices" |
| 5010 | " (%d/%d failed)\n", | 5098 | " (%d/%d failed)\n", |
| 5011 | mdname(mddev), mddev->degraded, conf->raid_disks); | 5099 | mdname(mddev), mddev->degraded, conf->raid_disks); |
| @@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev) | |||
| 5207 | for (i = 0; i < conf->raid_disks; i++) { | 5295 | for (i = 0; i < conf->raid_disks; i++) { |
| 5208 | tmp = conf->disks + i; | 5296 | tmp = conf->disks + i; |
| 5209 | if (tmp->rdev | 5297 | if (tmp->rdev |
| 5298 | && tmp->rdev->recovery_offset == MaxSector | ||
| 5210 | && !test_bit(Faulty, &tmp->rdev->flags) | 5299 | && !test_bit(Faulty, &tmp->rdev->flags) |
| 5211 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5300 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
| 5212 | unsigned long flags; | 5301 | unsigned long flags; |
| @@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
| 5242 | * isn't possible. | 5331 | * isn't possible. |
| 5243 | */ | 5332 | */ |
| 5244 | if (!test_bit(Faulty, &rdev->flags) && | 5333 | if (!test_bit(Faulty, &rdev->flags) && |
| 5245 | mddev->degraded <= conf->max_degraded && | 5334 | !has_failed(conf) && |
| 5246 | number < conf->raid_disks) { | 5335 | number < conf->raid_disks) { |
| 5247 | err = -EBUSY; | 5336 | err = -EBUSY; |
| 5248 | goto abort; | 5337 | goto abort; |
| @@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 5270 | int first = 0; | 5359 | int first = 0; |
| 5271 | int last = conf->raid_disks - 1; | 5360 | int last = conf->raid_disks - 1; |
| 5272 | 5361 | ||
| 5273 | if (mddev->degraded > conf->max_degraded) | 5362 | if (has_failed(conf)) |
| 5274 | /* no point adding a device */ | 5363 | /* no point adding a device */ |
| 5275 | return -EINVAL; | 5364 | return -EINVAL; |
| 5276 | 5365 | ||
| @@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev) | |||
| 5362 | if (mddev->bitmap) | 5451 | if (mddev->bitmap) |
| 5363 | /* Cannot grow a bitmap yet */ | 5452 | /* Cannot grow a bitmap yet */ |
| 5364 | return -EBUSY; | 5453 | return -EBUSY; |
| 5365 | if (mddev->degraded > conf->max_degraded) | 5454 | if (has_failed(conf)) |
| 5366 | return -EINVAL; | 5455 | return -EINVAL; |
| 5367 | if (mddev->delta_disks < 0) { | 5456 | if (mddev->delta_disks < 0) { |
| 5368 | /* We might be able to shrink, but the devices must | 5457 | /* We might be able to shrink, but the devices must |
| @@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5437 | 5526 | ||
| 5438 | /* Add some new drives, as many as will fit. | 5527 | /* Add some new drives, as many as will fit. |
| 5439 | * We know there are enough to make the newly sized array work. | 5528 | * We know there are enough to make the newly sized array work. |
| 5529 | * Don't add devices if we are reducing the number of | ||
| 5530 | * devices in the array. This is because it is not possible | ||
| 5531 | * to correctly record the "partially reconstructed" state of | ||
| 5532 | * such devices during the reshape and confusion could result. | ||
| 5440 | */ | 5533 | */ |
| 5441 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5534 | if (mddev->delta_disks >= 0) |
| 5535 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
| 5442 | if (rdev->raid_disk < 0 && | 5536 | if (rdev->raid_disk < 0 && |
| 5443 | !test_bit(Faulty, &rdev->flags)) { | 5537 | !test_bit(Faulty, &rdev->flags)) { |
| 5444 | if (raid5_add_disk(mddev, rdev) == 0) { | 5538 | if (raid5_add_disk(mddev, rdev) == 0) { |
| @@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5460 | } | 5554 | } |
| 5461 | 5555 | ||
| 5462 | /* When a reshape changes the number of devices, ->degraded | 5556 | /* When a reshape changes the number of devices, ->degraded |
| 5463 | * is measured against the large of the pre and post number of | 5557 | * is measured against the larger of the pre and post number of |
| 5464 | * devices.*/ | 5558 | * devices.*/ |
| 5465 | if (mddev->delta_disks > 0) { | 5559 | if (mddev->delta_disks > 0) { |
| 5466 | spin_lock_irqsave(&conf->device_lock, flags); | 5560 | spin_lock_irqsave(&conf->device_lock, flags); |
