commit 93416253073511716f7e70c06e32c3810c3deac4
tree   7e6a4c7dab40596f6b622f0eaa4b3366ed671b79
parent b4322e7057ca851b0a3e15f29e26806efeada100
parent 3424bf6a772cff606fc4bc24a3639c937afb547f
author    Linus Torvalds <torvalds@linux-foundation.org>  2010-06-28 01:56:32 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2010-06-28 01:56:32 -0400
Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md:
  md/raid5: don't include 'spare' drives when reshaping to fewer devices.
  md/raid5: add a missing 'continue' in a loop.
  md/raid5: Allow recovered part of partially recovered devices to be in-sync
  md/raid5: More careful check for "has array failed".
  md: Don't update ->recovery_offset when reshaping an array to fewer devices.
  md/raid5: avoid oops when number of devices is reduced then increased.
  md: enable raid4->raid0 takeover
  md: clear layout after ->raid0 takeover
  md: fix raid10 takeover: use new_layout for setup_conf
  md: fix handling of array level takeover that re-arranges devices.
  md: raid10: Fix null pointer dereference in fix_read_error()
  Restore partition detection of newly created md arrays.
-rw-r--r--  drivers/md/md.c      38
-rw-r--r--  drivers/md/md.h       3
-rw-r--r--  drivers/md/raid0.c   21
-rw-r--r--  drivers/md/raid0.h    3
-rw-r--r--  drivers/md/raid10.c  46
-rw-r--r--  drivers/md/raid10.h   5
-rw-r--r--  drivers/md/raid5.c  150
7 files changed, 187 insertions(+), 79 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 46b3a044eadf..cb20d0b0555a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 	/* First make sure individual recovery_offsets are correct */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk >= 0 &&
+		    mddev->delta_disks >= 0 &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    mddev->curr_resync_completed > rdev->recovery_offset)
 			rdev->recovery_offset = mddev->curr_resync_completed;
@@ -3001,6 +3002,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EINVAL;
 	}
 
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		rdev->new_raid_disk = rdev->raid_disk;
+
 	/* ->takeover must set new_* and/or delta_disks
 	 * if it succeeds, and may set them when it fails.
 	 */
@@ -3051,13 +3055,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		mddev->safemode = 0;
 	}
 
-	module_put(mddev->pers->owner);
-	/* Invalidate devices that are now superfluous */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk >= mddev->raid_disks) {
-			rdev->raid_disk = -1;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		char nm[20];
+		if (rdev->raid_disk < 0)
+			continue;
+		if (rdev->new_raid_disk > mddev->raid_disks)
+			rdev->new_raid_disk = -1;
+		if (rdev->new_raid_disk == rdev->raid_disk)
+			continue;
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		sysfs_remove_link(&mddev->kobj, nm);
+	}
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk < 0)
+			continue;
+		if (rdev->new_raid_disk == rdev->raid_disk)
+			continue;
+		rdev->raid_disk = rdev->new_raid_disk;
+		if (rdev->raid_disk < 0)
 			clear_bit(In_sync, &rdev->flags);
+		else {
+			char nm[20];
+			sprintf(nm, "rd%d", rdev->raid_disk);
+			if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+				printk("md: cannot register %s for %s after level change\n",
+				       nm, mdname(mddev));
 		}
+	}
+
+	module_put(mddev->pers->owner);
 	mddev->pers = pers;
 	mddev->private = priv;
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
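
A note on the level_store() hunk above: the sysfs "rd%d" links are moved in two passes. Every stale link is removed while the old slot numbers are still valid, and only then are the new links created, which also avoids name collisions when a takeover permutes slots. A minimal userspace sketch of the same pattern (toy types and printf stand-ins, not kernel code):

/*
 * Hypothetical userspace sketch of the remove-all/create-all pattern used
 * in level_store().  When a takeover permutes slots (raid0->raid10 doubles
 * them, for instance), creating "rd%d" links in a single pass could try a
 * name that a later device has not yet released.  Removing every stale
 * link first makes the order of creation irrelevant.
 */
#include <stdio.h>

struct toy_rdev { int raid_disk; int new_raid_disk; };

static void move_links(struct toy_rdev *devs, int n)
{
	int i;

	/* pass 1: drop links for every device whose slot changes */
	for (i = 0; i < n; i++)
		if (devs[i].raid_disk >= 0 &&
		    devs[i].new_raid_disk != devs[i].raid_disk)
			printf("remove rd%d\n", devs[i].raid_disk);

	/* pass 2: every target name is now free; create the new links */
	for (i = 0; i < n; i++)
		if (devs[i].raid_disk >= 0 &&
		    devs[i].new_raid_disk != devs[i].raid_disk) {
			devs[i].raid_disk = devs[i].new_raid_disk;
			printf("create rd%d\n", devs[i].raid_disk);
		}
}

int main(void)
{
	/* raid0 -> raid10 takeover: slots 0,1 become 0,2 */
	struct toy_rdev devs[] = { { 0, 0 }, { 1, 2 } };

	move_links(devs, 2);
	return 0;
}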
@@ -5895,6 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	atomic_inc(&mddev->openers);
 	mutex_unlock(&mddev->open_mutex);
 
+	check_disk_size_change(mddev->gendisk, bdev);
  out:
 	return err;
 }
@@ -6846,6 +6873,7 @@ void md_do_sync(mddev_t *mddev)
 	rcu_read_lock();
 	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
+		    mddev->delta_disks >= 0 &&
 		    !test_bit(Faulty, &rdev->flags) &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    rdev->recovery_offset < mddev->curr_resync)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7ab5ea155452..10597bfec000 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -78,6 +78,9 @@ struct mdk_rdev_s
 
 	int desc_nr;		/* descriptor index in the superblock */
 	int raid_disk;		/* role of device in array */
+	int new_raid_disk;	/* role that the device will have in
+				 * the array after a level-change completes.
+				 */
 	int saved_raid_disk;	/* role that device used to have in the
 				 * array and could again if we did a partial
 				 * resync from the bitmap
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e70f004c99e8..563abed5a2cb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		int j = rdev1->raid_disk;
 
-		if (mddev->level == 10)
+		if (mddev->level == 10) {
 			/* taking over a raid10-n2 array */
 			j /= 2;
+			rdev1->new_raid_disk = j;
+		}
 
 		if (j < 0 || j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
@@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev)
 		mddev->private = conf;
 	}
 	conf = mddev->private;
-	if (conf->scale_raid_disks) {
-		int i;
-		for (i=0; i < conf->strip_zone[0].nb_dev; i++)
-			conf->devlist[i]->raid_disk /= conf->scale_raid_disks;
-		/* FIXME update sysfs rd links */
-	}
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
@@ -573,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
-static void *raid0_takeover_raid5(mddev_t *mddev)
+static void *raid0_takeover_raid45(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 	raid0_conf_t *priv_conf;
@@ -596,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev)
 
 	/* Set new parameters */
 	mddev->new_level = 0;
+	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->raid_disks--;
 	mddev->delta_disks = -1;
@@ -635,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 
 	/* Set new parameters */
 	mddev->new_level = 0;
+	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->delta_disks = - mddev->raid_disks / 2;
 	mddev->raid_disks += mddev->delta_disks;
@@ -643,19 +641,22 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 	mddev->recovery_cp = MaxSector;
 
 	create_strip_zones(mddev, &priv_conf);
-	priv_conf->scale_raid_disks = 2;
 	return priv_conf;
 }
 
 static void *raid0_takeover(mddev_t *mddev)
 {
 	/* raid0 can take over:
+	 *  raid4 - if all data disks are active.
 	 *  raid5 - providing it is Raid4 layout and one disk is faulty
 	 *  raid10 - assuming we have all necessary active disks
 	 */
+	if (mddev->level == 4)
+		return raid0_takeover_raid45(mddev);
+
 	if (mddev->level == 5) {
 		if (mddev->layout == ALGORITHM_PARITY_N)
-			return raid0_takeover_raid5(mddev);
+			return raid0_takeover_raid45(mddev);
 
 		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
 		       mdname(mddev), ALGORITHM_PARITY_N);
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index d724e664ca4d..91f8e876ee64 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -13,9 +13,6 @@ struct raid0_private_data
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-	int scale_raid_disks; /* divide rdev->raid_disks by this in run()
-			       * to handle conversion from raid10
-			       */
 };
 
 typedef struct raid0_private_data raid0_conf_t;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 03724992cdf2..42e64e4e5e25 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int sectors = r10_bio->sectors;
 	mdk_rdev_t*rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
 	rcu_read_lock();
-	{
-		int d = r10_bio->devs[r10_bio->read_slot].devnum;
+	rdev = rcu_dereference(conf->mirrors[d].rdev);
+	if (rdev) { /* If rdev is not NULL */
 		char b[BDEVNAME_SIZE];
 		int cur_read_error_count = 0;
 
-		rdev = rcu_dereference(conf->mirrors[d].rdev);
 		bdevname(rdev->bdev, b);
 
 		if (test_bit(Faulty, &rdev->flags)) {
@@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 
 		rcu_read_lock();
 		do {
-			int d = r10_bio->devs[sl].devnum;
+			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
 			    test_bit(In_sync, &rdev->flags)) {
@@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		rcu_read_lock();
 		while (sl != r10_bio->read_slot) {
 			char b[BDEVNAME_SIZE];
-			int d;
+
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		}
 		sl = start;
 		while (sl != r10_bio->read_slot) {
-			int d;
+
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev)
 	sector_t stride, size;
 	int err = -EINVAL;
 
-	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
-	    !is_power_of_2(mddev->chunk_sectors)) {
+	if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
+	    !is_power_of_2(mddev->new_chunk_sectors)) {
 		printk(KERN_ERR "md/raid10:%s: chunk size must be "
 		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
 		       mdname(mddev), PAGE_SIZE);
 		goto out;
 	}
 
-	nc = mddev->layout & 255;
-	fc = (mddev->layout >> 8) & 255;
-	fo = mddev->layout & (1<<16);
+	nc = mddev->new_layout & 255;
+	fc = (mddev->new_layout >> 8) & 255;
+	fo = mddev->new_layout & (1<<16);
 
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
-	    (mddev->layout >> 17)) {
+	    (mddev->new_layout >> 17)) {
 		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
-		       mdname(mddev), mddev->layout);
+		       mdname(mddev), mddev->new_layout);
 		goto out;
 	}
 
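
For reference, the raid10 layout word unpacked above carries three packed fields. A standalone sketch of the same decoding (illustration only, mirroring the nc/fc/fo extraction in setup_conf()):

/* Bits 0-7 hold the near-copy count, bits 8-15 the far-copy count, and
 * bit 16 selects "far offset" mode.  The value (1 << 8) + 2 installed by
 * raid10_takeover_raid0 therefore means near_copies = 2, far_copies = 1,
 * offset mode off: a plain n2 layout.
 */
#include <stdio.h>

static void decode_layout(int layout)
{
	int nc = layout & 255;          /* near copies */
	int fc = (layout >> 8) & 255;   /* far copies */
	int fo = layout & (1 << 16);    /* far copies use "offset" style */

	printf("layout 0x%x: near=%d far=%d offset=%s\n",
	       layout, nc, fc, fo ? "yes" : "no");
}

int main(void)
{
	decode_layout((1 << 8) + 2);	/* the n2 layout used for takeover */
	return 0;
}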
@@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev)
 	if (!conf->thread)
 		goto out;
 
-	conf->scale_disks = 0;
 	conf->mddev = mddev;
 	return conf;
 
@@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev)
 		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
 			continue;
-		if (conf->scale_disks) {
-			disk_idx *= conf->scale_disks;
-			rdev->raid_disk = disk_idx;
-			/* MOVE 'rd%d' link !! */
-		}
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
@@ -2435,26 +2429,22 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 		return ERR_PTR(-EINVAL);
 	}
 
-	/* Update slot numbers to obtain
-	 * degraded raid10 with missing mirrors
-	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		rdev->raid_disk *= 2;
-	}
-
 	/* Set new parameters */
 	mddev->new_level = 10;
 	/* new layout: far_copies = 1, near_copies = 2 */
 	mddev->new_layout = (1<<8) + 2;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->delta_disks = mddev->raid_disks;
-	mddev->degraded = mddev->raid_disks;
 	mddev->raid_disks *= 2;
 	/* make sure it will be not marked as dirty */
 	mddev->recovery_cp = MaxSector;
 
 	conf = setup_conf(mddev);
-	conf->scale_disks = 2;
+	if (!IS_ERR(conf))
+		list_for_each_entry(rdev, &mddev->disks, same_set)
+			if (rdev->raid_disk >= 0)
+				rdev->new_raid_disk = rdev->raid_disk * 2;
+
 	return conf;
 }
 
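The new_raid_disk assignment above yields a degraded near-2 raid10: each original raid0 member lands in an even slot and every odd slot is a missing mirror, to be filled by later recovery. A quick sketch of the resulting slot map (illustration only):

/* An n-disk raid0 becomes a 2n-slot raid10-n2 where old disk k takes
 * slot 2k and each slot 2k+1 is an absent mirror copy. */
#include <stdio.h>

int main(void)
{
	int raid_disks = 3;	/* example raid0 with three members */
	int slot;

	for (slot = 0; slot < raid_disks * 2; slot++) {
		if (slot % 2 == 0)
			printf("slot %d: old raid0 disk %d\n", slot, slot / 2);
		else
			printf("slot %d: missing mirror\n", slot);
	}
	return 0;
}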
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3824a087e17c..2316ac2e8e21 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -38,11 +38,6 @@ struct r10_private_data_s {
 	int chunk_shift; /* shift from chunks to sectors */
 	sector_t chunk_mask;
 
-	int scale_disks;  /* When starting array, multiply
-			   * each ->raid_disk by this.
-			   * Need for raid0->raid10 migration
-			   */
-
 	struct list_head retry_list;
 	/* queue pending writes and submit them on unplug */
 	struct bio_list pending_bio_list;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d2c0f94fa37d..96c690279fc6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -277,12 +277,13 @@ out:
 	return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh, int num)
+static void shrink_buffers(struct stripe_head *sh)
 {
 	struct page *p;
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num ; i++) {
+	for (i = 0; i < num ; i++) {
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
 	}
 }
 
-static int grow_buffers(struct stripe_head *sh, int num)
+static int grow_buffers(struct stripe_head *sh)
 {
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num; i++) {
+	for (i = 0; i < num; i++) {
 		struct page *page;
 
 		if (!(page = alloc_page(GFP_KERNEL))) {
@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 	return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+	int degraded;
+	int i;
+	if (conf->mddev->reshape_position == MaxSector)
+		return conf->mddev->degraded > conf->max_degraded;
+
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->previous_raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If the reshape increases the number of devices,
+			 * this is being recovered by the reshape, so
+			 * this 'previous' section is not in_sync.
+			 * If the number of devices is being reduced however,
+			 * the device can only be part of the array if
+			 * we are reverting a reshape, so this section will
+			 * be in-sync.
+			 */
+			if (conf->raid_disks >= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If reshape increases the number of devices, this
+			 * section has already been recovered, else it
+			 * almost certainly hasn't.
+			 */
+			if (conf->raid_disks <= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
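A concrete case showing why has_failed() must inspect both sections: shrink a 4-device raid5 (max_degraded = 1) while one device is faulty and another is only partially recovered. A plain degraded-count test sees one failure and passes, but the post-reshape section has two unusable members. A toy model of the same logic (userspace sketch, not kernel code):

/* Devices: 0 = missing/faulty, 1 = fully in-sync, 2 = present but only
 * partially recovered. */
#include <stdio.h>

#define FAULTY  0
#define INSYNC  1
#define PARTIAL 2

static int section_degraded(const int *dev, int n, int counts_partial)
{
	int i, degraded = 0;

	for (i = 0; i < n; i++) {
		if (dev[i] == FAULTY)
			degraded++;
		else if (dev[i] == PARTIAL && counts_partial)
			degraded++;
	}
	return degraded;
}

static int toy_has_failed(const int *dev, int prev_disks, int raid_disks,
			  int max_degraded)
{
	/* 'previous' section: a partial device counts as degraded only
	 * when the reshape is growing (it is still being recovered). */
	if (section_degraded(dev, prev_disks,
			     raid_disks >= prev_disks) > max_degraded)
		return 1;
	/* 'new' section: a partial device counts when the array is
	 * shrinking (that part has almost certainly not been recovered). */
	if (section_degraded(dev, raid_disks,
			     raid_disks <= prev_disks) > max_degraded)
		return 1;
	return 0;
}

int main(void)
{
	/* shrinking 4 -> 3: slot 0 faulty, slot 2 only partially
	 * recovered.  The new section has two unusable members, so the
	 * check reports failure even though only one device is faulty. */
	int shrink[4] = { FAULTY, INSYNC, PARTIAL, INSYNC };

	printf("shrink case: has_failed = %d\n",
	       toy_has_failed(shrink, 4, 3, 1));	/* prints 1 */
	return 0;
}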
@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
-	int disks = max(conf->raid_disks, conf->previous_raid_disks);
 	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
+	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
 #ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 #endif
 
-	if (grow_buffers(sh, disks)) {
-		shrink_buffers(sh, disks);
+	if (grow_buffers(sh)) {
+		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
 	if (!sh)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
-	shrink_buffers(sh, conf->pool_size);
+	shrink_buffers(sh);
 	kmem_cache_free(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	return 1;
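The grow_buffers()/shrink_buffers() change above removes the caller-supplied count: both sides now read conf->pool_size, so a stripe is always freed with the same page count it was allocated with, even if raid_disks changes across a reshape (the "reduced then increased" oops in the merge log). A userspace sketch of the symmetric pattern (toy types, assumed 4K pages, not kernel code):

/* If the element count is passed separately to the alloc and free paths,
 * a value that changes in between desynchronizes the pair.  Deriving the
 * count from one field at both sites makes the pair symmetric by
 * construction. */
#include <stdlib.h>

struct toy_conf { int pool_size; };
struct toy_stripe { struct toy_conf *conf; void **pages; };

static int toy_grow_buffers(struct toy_stripe *sh)
{
	int i, num = sh->conf->pool_size;	/* single source of truth */

	sh->pages = calloc(num, sizeof(void *));
	if (!sh->pages)
		return -1;
	for (i = 0; i < num; i++)
		sh->pages[i] = malloc(4096);	/* stand-in for alloc_page() */
	return 0;
}

static void toy_shrink_buffers(struct toy_stripe *sh)
{
	int i, num = sh->conf->pool_size;	/* same field, same count */

	for (i = 0; i < num; i++)
		free(sh->pages[i]);
	free(sh->pages);
}

int main(void)
{
	struct toy_conf conf = { .pool_size = 5 };
	struct toy_stripe sh = { .conf = &conf };

	if (toy_grow_buffers(&sh) == 0)
		toy_shrink_buffers(&sh);
	return 0;
}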
@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh)
 		mdk_rdev_t *rdev;
 
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
 			"written %p\n", i, dev->flags, dev->toread, dev->read,
@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* could be in-sync depending on recovery/reshape status */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			s.failed++;
 			s.failed_num = i;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
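The reworked R5_Insync logic above makes the in-sync decision per stripe rather than per device: a device that is still recovering can serve any stripe lying entirely below its recovery_offset. A standalone illustration (STRIPE_SECTORS is STRIPE_SIZE >> 9, i.e. 8 sectors with 4K pages):

#include <stdio.h>

#define STRIPE_SECTORS 8

typedef unsigned long long sector_t;

/* Mirrors the per-stripe test added above: the whole stripe must fall
 * inside the already-recovered region of the device. */
static int stripe_insync(sector_t sh_sector, sector_t recovery_offset)
{
	return sh_sector + STRIPE_SECTORS <= recovery_offset;
}

int main(void)
{
	sector_t recovery_offset = 1024;	/* recovered up to here */

	printf("stripe at 1000: %s\n",
	       stripe_insync(1000, recovery_offset) ? "in-sync" : "not yet");
	printf("stripe at 1020: %s\n",
	       stripe_insync(1020, recovery_offset) ? "in-sync" : "not yet");
	return 0;
}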
@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* in sync if before recovery_offset */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s.failed < 2)
 				r6s.failed_num[s.failed] = i;
 			s.failed++;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk < 0)
 			continue;
-		if (test_bit(In_sync, &rdev->flags))
+		if (test_bit(In_sync, &rdev->flags)) {
 			working_disks++;
+			continue;
+		}
 		/* This disc is not fully in-sync. However if it
 		 * just stored parity (beyond the recovery_offset),
 		 * when we don't need to be concerned about the
@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev)
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
 
-	if (mddev->degraded > conf->max_degraded) {
+	if (has_failed(conf)) {
 		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 		       " (%d/%d failed)\n",
 		       mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev)
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->disks + i;
 		if (tmp->rdev
+		    && tmp->rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &tmp->rdev->flags)
 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			unsigned long flags;
@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 	 * isn't possible.
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
-	    mddev->degraded <= conf->max_degraded &&
+	    !has_failed(conf) &&
 	    number < conf->raid_disks) {
 		err = -EBUSY;
 		goto abort;
@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
 
@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev)
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
 		/* We might be able to shrink, but the devices must
@@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev)
 
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
+	 * Don't add devices if we are reducing the number of
+	 * devices in the array.  This is because it is not possible
+	 * to correctly record the "partially reconstructed" state of
+	 * such devices during the reshape and confusion could result.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	if (mddev->delta_disks >= 0)
+		list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			if (raid5_add_disk(mddev, rdev) == 0) {
@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	}
 
 	/* When a reshape changes the number of devices, ->degraded
-	 * is measured against the large of the pre and post number of
+	 * is measured against the larger of the pre and post number of
 	 * devices.*/
 	if (mddev->delta_disks > 0) {
 		spin_lock_irqsave(&conf->device_lock, flags);