aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2007-02-28 23:11:53 -0500
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-03-01 17:53:36 -0500
commitf416885ef4950501dd3858d1afa1137a0c2905c5 (patch)
tree77003957b071e5728fefedd905bf6f8c348fb4b3 /drivers/md/raid5.c
parentb4c4c7b8095298ff4ce20b40bf180ada070812d0 (diff)
[PATCH] md: add support for reshape of a raid6
i.e. one or more drives can be added and the array will re-stripe while on-line. Most of the interesting work was already done for raid5. This just extends it to raid6. mdadm newer than 2.6 is needed for complete safety, however any version of mdadm which supports raid5 reshape will do a good enough job in almost all cases (an 'echo repair > /sys/block/mdX/md/sync_action' is recommended after a reshape that was aborted and had to be restarted with such a version of mdadm). Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c157
1 files changed, 124 insertions, 33 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 29fc06b47d4e..d247429ee5ef 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1050,7 +1050,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
1050static void compute_parity6(struct stripe_head *sh, int method) 1050static void compute_parity6(struct stripe_head *sh, int method)
1051{ 1051{
1052 raid6_conf_t *conf = sh->raid_conf; 1052 raid6_conf_t *conf = sh->raid_conf;
1053 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; 1053 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1054 struct bio *chosen; 1054 struct bio *chosen;
1055 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1055 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1056 void *ptrs[disks]; 1056 void *ptrs[disks];
@@ -1131,8 +1131,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
1131/* Compute one missing block */ 1131/* Compute one missing block */
1132static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) 1132static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1133{ 1133{
1134 raid6_conf_t *conf = sh->raid_conf; 1134 int i, count, disks = sh->disks;
1135 int i, count, disks = conf->raid_disks;
1136 void *ptr[MAX_XOR_BLOCKS], *p; 1135 void *ptr[MAX_XOR_BLOCKS], *p;
1137 int pd_idx = sh->pd_idx; 1136 int pd_idx = sh->pd_idx;
1138 int qd_idx = raid6_next_disk(pd_idx, disks); 1137 int qd_idx = raid6_next_disk(pd_idx, disks);
@@ -1170,8 +1169,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1170/* Compute two missing blocks */ 1169/* Compute two missing blocks */
1171static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) 1170static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1172{ 1171{
1173 raid6_conf_t *conf = sh->raid_conf; 1172 int i, count, disks = sh->disks;
1174 int i, count, disks = conf->raid_disks;
1175 int pd_idx = sh->pd_idx; 1173 int pd_idx = sh->pd_idx;
1176 int qd_idx = raid6_next_disk(pd_idx, disks); 1174 int qd_idx = raid6_next_disk(pd_idx, disks);
1177 int d0_idx = raid6_next_disk(qd_idx, disks); 1175 int d0_idx = raid6_next_disk(qd_idx, disks);
@@ -1887,11 +1885,11 @@ static void handle_stripe5(struct stripe_head *sh)
1887static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 1885static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1888{ 1886{
1889 raid6_conf_t *conf = sh->raid_conf; 1887 raid6_conf_t *conf = sh->raid_conf;
1890 int disks = conf->raid_disks; 1888 int disks = sh->disks;
1891 struct bio *return_bi= NULL; 1889 struct bio *return_bi= NULL;
1892 struct bio *bi; 1890 struct bio *bi;
1893 int i; 1891 int i;
1894 int syncing; 1892 int syncing, expanding, expanded;
1895 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; 1893 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1896 int non_overwrite = 0; 1894 int non_overwrite = 0;
1897 int failed_num[2] = {0, 0}; 1895 int failed_num[2] = {0, 0};
@@ -1909,6 +1907,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1909 clear_bit(STRIPE_DELAYED, &sh->state); 1907 clear_bit(STRIPE_DELAYED, &sh->state);
1910 1908
1911 syncing = test_bit(STRIPE_SYNCING, &sh->state); 1909 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1910 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1911 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1912 /* Now to look around and see what can be done */ 1912 /* Now to look around and see what can be done */
1913 1913
1914 rcu_read_lock(); 1914 rcu_read_lock();
@@ -2114,13 +2114,15 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2114 * parity, or to satisfy requests 2114 * parity, or to satisfy requests
2115 * or to load a block that is being partially written. 2115 * or to load a block that is being partially written.
2116 */ 2116 */
2117 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { 2117 if (to_read || non_overwrite || (to_write && failed) ||
2118 (syncing && (uptodate < disks)) || expanding) {
2118 for (i=disks; i--;) { 2119 for (i=disks; i--;) {
2119 dev = &sh->dev[i]; 2120 dev = &sh->dev[i];
2120 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && 2121 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2121 (dev->toread || 2122 (dev->toread ||
2122 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2123 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2123 syncing || 2124 syncing ||
2125 expanding ||
2124 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || 2126 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2125 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) 2127 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2126 ) 2128 )
@@ -2355,6 +2357,79 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2355 } 2357 }
2356 } 2358 }
2357 } 2359 }
2360
2361 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
2362 /* Need to write out all blocks after computing P&Q */
2363 sh->disks = conf->raid_disks;
2364 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2365 conf->raid_disks);
2366 compute_parity6(sh, RECONSTRUCT_WRITE);
2367 for (i = conf->raid_disks ; i-- ; ) {
2368 set_bit(R5_LOCKED, &sh->dev[i].flags);
2369 locked++;
2370 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2371 }
2372 clear_bit(STRIPE_EXPANDING, &sh->state);
2373 } else if (expanded) {
2374 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2375 atomic_dec(&conf->reshape_stripes);
2376 wake_up(&conf->wait_for_overlap);
2377 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2378 }
2379
2380 if (expanding && locked == 0) {
2381 /* We have read all the blocks in this stripe and now we need to
2382 * copy some of them into a target stripe for expand.
2383 */
2384 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2385 for (i = 0; i < sh->disks ; i++)
2386 if (i != pd_idx && i != qd_idx) {
2387 int dd_idx2, pd_idx2, j;
2388 struct stripe_head *sh2;
2389
2390 sector_t bn = compute_blocknr(sh, i);
2391 sector_t s = raid5_compute_sector(
2392 bn, conf->raid_disks,
2393 conf->raid_disks - conf->max_degraded,
2394 &dd_idx2, &pd_idx2, conf);
2395 sh2 = get_active_stripe(conf, s,
2396 conf->raid_disks,
2397 pd_idx2, 1);
2398 if (sh2 == NULL)
2399 /* so for only the early blocks of
2400 * this stripe have been requests.
2401 * When later blocks get requests, we
2402 * will try again
2403 */
2404 continue;
2405 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2406 test_bit(R5_Expanded,
2407 &sh2->dev[dd_idx2].flags)) {
2408 /* must have already done this block */
2409 release_stripe(sh2);
2410 continue;
2411 }
2412 memcpy(page_address(sh2->dev[dd_idx2].page),
2413 page_address(sh->dev[i].page),
2414 STRIPE_SIZE);
2415 set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
2416 set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
2417 for (j = 0 ; j < conf->raid_disks ; j++)
2418 if (j != sh2->pd_idx &&
2419 j != raid6_next_disk(sh2->pd_idx,
2420 sh2->disks) &&
2421 !test_bit(R5_Expanded,
2422 &sh2->dev[j].flags))
2423 break;
2424 if (j == conf->raid_disks) {
2425 set_bit(STRIPE_EXPAND_READY,
2426 &sh2->state);
2427 set_bit(STRIPE_HANDLE, &sh2->state);
2428 }
2429 release_stripe(sh2);
2430 }
2431 }
2432
2358 spin_unlock(&sh->lock); 2433 spin_unlock(&sh->lock);
2359 2434
2360 while ((bi=return_bi)) { 2435 while ((bi=return_bi)) {
@@ -2395,7 +2470,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2395 rcu_read_unlock(); 2470 rcu_read_unlock();
2396 2471
2397 if (rdev) { 2472 if (rdev) {
2398 if (syncing) 2473 if (syncing || expanding || expanded)
2399 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 2474 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2400 2475
2401 bi->bi_bdev = rdev->bdev; 2476 bi->bi_bdev = rdev->bdev;
@@ -2915,8 +2990,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2915 struct stripe_head *sh; 2990 struct stripe_head *sh;
2916 int pd_idx; 2991 int pd_idx;
2917 sector_t first_sector, last_sector; 2992 sector_t first_sector, last_sector;
2918 int raid_disks; 2993 int raid_disks = conf->previous_raid_disks;
2919 int data_disks; 2994 int data_disks = raid_disks - conf->max_degraded;
2995 int new_data_disks = conf->raid_disks - conf->max_degraded;
2920 int i; 2996 int i;
2921 int dd_idx; 2997 int dd_idx;
2922 sector_t writepos, safepos, gap; 2998 sector_t writepos, safepos, gap;
@@ -2925,7 +3001,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2925 conf->expand_progress != 0) { 3001 conf->expand_progress != 0) {
2926 /* restarting in the middle, skip the initial sectors */ 3002 /* restarting in the middle, skip the initial sectors */
2927 sector_nr = conf->expand_progress; 3003 sector_nr = conf->expand_progress;
2928 sector_div(sector_nr, conf->raid_disks-1); 3004 sector_div(sector_nr, new_data_disks);
2929 *skipped = 1; 3005 *skipped = 1;
2930 return sector_nr; 3006 return sector_nr;
2931 } 3007 }
@@ -2939,14 +3015,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2939 * to after where expand_lo old_maps to 3015 * to after where expand_lo old_maps to
2940 */ 3016 */
2941 writepos = conf->expand_progress + 3017 writepos = conf->expand_progress +
2942 conf->chunk_size/512*(conf->raid_disks-1); 3018 conf->chunk_size/512*(new_data_disks);
2943 sector_div(writepos, conf->raid_disks-1); 3019 sector_div(writepos, new_data_disks);
2944 safepos = conf->expand_lo; 3020 safepos = conf->expand_lo;
2945 sector_div(safepos, conf->previous_raid_disks-1); 3021 sector_div(safepos, data_disks);
2946 gap = conf->expand_progress - conf->expand_lo; 3022 gap = conf->expand_progress - conf->expand_lo;
2947 3023
2948 if (writepos >= safepos || 3024 if (writepos >= safepos ||
2949 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { 3025 gap > (new_data_disks)*3000*2 /*3Meg*/) {
2950 /* Cannot proceed until we've updated the superblock... */ 3026 /* Cannot proceed until we've updated the superblock... */
2951 wait_event(conf->wait_for_overlap, 3027 wait_event(conf->wait_for_overlap,
2952 atomic_read(&conf->reshape_stripes)==0); 3028 atomic_read(&conf->reshape_stripes)==0);
@@ -2976,6 +3052,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2976 sector_t s; 3052 sector_t s;
2977 if (j == sh->pd_idx) 3053 if (j == sh->pd_idx)
2978 continue; 3054 continue;
3055 if (conf->level == 6 &&
3056 j == raid6_next_disk(sh->pd_idx, sh->disks))
3057 continue;
2979 s = compute_blocknr(sh, j); 3058 s = compute_blocknr(sh, j);
2980 if (s < (mddev->array_size<<1)) { 3059 if (s < (mddev->array_size<<1)) {
2981 skipped = 1; 3060 skipped = 1;
@@ -2999,21 +3078,20 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2999 * The source stripes are determined by mapping the first and last 3078 * The source stripes are determined by mapping the first and last
3000 * block on the destination stripes. 3079 * block on the destination stripes.
3001 */ 3080 */
3002 raid_disks = conf->previous_raid_disks;
3003 data_disks = raid_disks - 1;
3004 first_sector = 3081 first_sector =
3005 raid5_compute_sector(sector_nr*(conf->raid_disks-1), 3082 raid5_compute_sector(sector_nr*(new_data_disks),
3006 raid_disks, data_disks, 3083 raid_disks, data_disks,
3007 &dd_idx, &pd_idx, conf); 3084 &dd_idx, &pd_idx, conf);
3008 last_sector = 3085 last_sector =
3009 raid5_compute_sector((sector_nr+conf->chunk_size/512) 3086 raid5_compute_sector((sector_nr+conf->chunk_size/512)
3010 *(conf->raid_disks-1) -1, 3087 *(new_data_disks) -1,
3011 raid_disks, data_disks, 3088 raid_disks, data_disks,
3012 &dd_idx, &pd_idx, conf); 3089 &dd_idx, &pd_idx, conf);
3013 if (last_sector >= (mddev->size<<1)) 3090 if (last_sector >= (mddev->size<<1))
3014 last_sector = (mddev->size<<1)-1; 3091 last_sector = (mddev->size<<1)-1;
3015 while (first_sector <= last_sector) { 3092 while (first_sector <= last_sector) {
3016 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); 3093 pd_idx = stripe_to_pdidx(first_sector, conf,
3094 conf->previous_raid_disks);
3017 sh = get_active_stripe(conf, first_sector, 3095 sh = get_active_stripe(conf, first_sector,
3018 conf->previous_raid_disks, pd_idx, 0); 3096 conf->previous_raid_disks, pd_idx, 0);
3019 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3097 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
@@ -3348,35 +3426,44 @@ static int run(mddev_t *mddev)
3348 */ 3426 */
3349 sector_t here_new, here_old; 3427 sector_t here_new, here_old;
3350 int old_disks; 3428 int old_disks;
3429 int max_degraded = (mddev->level == 5 ? 1 : 2);
3351 3430
3352 if (mddev->new_level != mddev->level || 3431 if (mddev->new_level != mddev->level ||
3353 mddev->new_layout != mddev->layout || 3432 mddev->new_layout != mddev->layout ||
3354 mddev->new_chunk != mddev->chunk_size) { 3433 mddev->new_chunk != mddev->chunk_size) {
3355 printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", 3434 printk(KERN_ERR "raid5: %s: unsupported reshape "
3435 "required - aborting.\n",
3356 mdname(mddev)); 3436 mdname(mddev));
3357 return -EINVAL; 3437 return -EINVAL;
3358 } 3438 }
3359 if (mddev->delta_disks <= 0) { 3439 if (mddev->delta_disks <= 0) {
3360 printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", 3440 printk(KERN_ERR "raid5: %s: unsupported reshape "
3441 "(reduce disks) required - aborting.\n",
3361 mdname(mddev)); 3442 mdname(mddev));
3362 return -EINVAL; 3443 return -EINVAL;
3363 } 3444 }
3364 old_disks = mddev->raid_disks - mddev->delta_disks; 3445 old_disks = mddev->raid_disks - mddev->delta_disks;
3365 /* reshape_position must be on a new-stripe boundary, and one 3446 /* reshape_position must be on a new-stripe boundary, and one
3366 * further up in new geometry must map after here in old geometry. 3447 * further up in new geometry must map after here in old
3448 * geometry.
3367 */ 3449 */
3368 here_new = mddev->reshape_position; 3450 here_new = mddev->reshape_position;
3369 if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { 3451 if (sector_div(here_new, (mddev->chunk_size>>9)*
3370 printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); 3452 (mddev->raid_disks - max_degraded))) {
3453 printk(KERN_ERR "raid5: reshape_position not "
3454 "on a stripe boundary\n");
3371 return -EINVAL; 3455 return -EINVAL;
3372 } 3456 }
3373 /* here_new is the stripe we will write to */ 3457 /* here_new is the stripe we will write to */
3374 here_old = mddev->reshape_position; 3458 here_old = mddev->reshape_position;
3375 sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); 3459 sector_div(here_old, (mddev->chunk_size>>9)*
3376 /* here_old is the first stripe that we might need to read from */ 3460 (old_disks-max_degraded));
3461 /* here_old is the first stripe that we might need to read
3462 * from */
3377 if (here_new >= here_old) { 3463 if (here_new >= here_old) {
3378 /* Reading from the same stripe as writing to - bad */ 3464 /* Reading from the same stripe as writing to - bad */
3379 printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); 3465 printk(KERN_ERR "raid5: reshape_position too early for "
3466 "auto-recovery - aborting.\n");
3380 return -EINVAL; 3467 return -EINVAL;
3381 } 3468 }
3382 printk(KERN_INFO "raid5: reshape will continue\n"); 3469 printk(KERN_INFO "raid5: reshape will continue\n");
@@ -3829,8 +3916,7 @@ static int raid5_start_reshape(mddev_t *mddev)
3829 int added_devices = 0; 3916 int added_devices = 0;
3830 unsigned long flags; 3917 unsigned long flags;
3831 3918
3832 if (mddev->degraded || 3919 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3833 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3834 return -EBUSY; 3920 return -EBUSY;
3835 3921
3836 ITERATE_RDEV(mddev, rdev, rtmp) 3922 ITERATE_RDEV(mddev, rdev, rtmp)
@@ -3838,7 +3924,7 @@ static int raid5_start_reshape(mddev_t *mddev)
3838 !test_bit(Faulty, &rdev->flags)) 3924 !test_bit(Faulty, &rdev->flags))
3839 spares++; 3925 spares++;
3840 3926
3841 if (spares < mddev->delta_disks-1) 3927 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
3842 /* Not enough devices even to make a degraded array 3928 /* Not enough devices even to make a degraded array
3843 * of that size 3929 * of that size
3844 */ 3930 */
@@ -3901,7 +3987,8 @@ static void end_reshape(raid5_conf_t *conf)
3901 struct block_device *bdev; 3987 struct block_device *bdev;
3902 3988
3903 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 3989 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
3904 conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); 3990 conf->mddev->array_size = conf->mddev->size *
3991 (conf->raid_disks - conf->max_degraded);
3905 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 3992 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
3906 conf->mddev->changed = 1; 3993 conf->mddev->changed = 1;
3907 3994
@@ -3974,6 +4061,10 @@ static struct mdk_personality raid6_personality =
3974 .spare_active = raid5_spare_active, 4061 .spare_active = raid5_spare_active,
3975 .sync_request = sync_request, 4062 .sync_request = sync_request,
3976 .resize = raid5_resize, 4063 .resize = raid5_resize,
4064#ifdef CONFIG_MD_RAID5_RESHAPE
4065 .check_reshape = raid5_check_reshape,
4066 .start_reshape = raid5_start_reshape,
4067#endif
3977 .quiesce = raid5_quiesce, 4068 .quiesce = raid5_quiesce,
3978}; 4069};
3979static struct mdk_personality raid5_personality = 4070static struct mdk_personality raid5_personality =