aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c 157
1 files changed, 124 insertions, 33 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 29fc06b47d4e..d247429ee5ef 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1050,7 +1050,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
1050static void compute_parity6(struct stripe_head *sh, int method) 1050static void compute_parity6(struct stripe_head *sh, int method)
1051{ 1051{
1052 raid6_conf_t *conf = sh->raid_conf; 1052 raid6_conf_t *conf = sh->raid_conf;
1053 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; 1053 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1054 struct bio *chosen; 1054 struct bio *chosen;
1055 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1055 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1056 void *ptrs[disks]; 1056 void *ptrs[disks];
@@ -1131,8 +1131,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
1131/* Compute one missing block */ 1131/* Compute one missing block */
1132static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) 1132static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1133{ 1133{
1134 raid6_conf_t *conf = sh->raid_conf; 1134 int i, count, disks = sh->disks;
1135 int i, count, disks = conf->raid_disks;
1136 void *ptr[MAX_XOR_BLOCKS], *p; 1135 void *ptr[MAX_XOR_BLOCKS], *p;
1137 int pd_idx = sh->pd_idx; 1136 int pd_idx = sh->pd_idx;
1138 int qd_idx = raid6_next_disk(pd_idx, disks); 1137 int qd_idx = raid6_next_disk(pd_idx, disks);
@@ -1170,8 +1169,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1170/* Compute two missing blocks */ 1169/* Compute two missing blocks */
1171static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) 1170static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1172{ 1171{
1173 raid6_conf_t *conf = sh->raid_conf; 1172 int i, count, disks = sh->disks;
1174 int i, count, disks = conf->raid_disks;
1175 int pd_idx = sh->pd_idx; 1173 int pd_idx = sh->pd_idx;
1176 int qd_idx = raid6_next_disk(pd_idx, disks); 1174 int qd_idx = raid6_next_disk(pd_idx, disks);
1177 int d0_idx = raid6_next_disk(qd_idx, disks); 1175 int d0_idx = raid6_next_disk(qd_idx, disks);
@@ -1887,11 +1885,11 @@ static void handle_stripe5(struct stripe_head *sh)
1887static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 1885static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1888{ 1886{
1889 raid6_conf_t *conf = sh->raid_conf; 1887 raid6_conf_t *conf = sh->raid_conf;
1890 int disks = conf->raid_disks; 1888 int disks = sh->disks;
1891 struct bio *return_bi= NULL; 1889 struct bio *return_bi= NULL;
1892 struct bio *bi; 1890 struct bio *bi;
1893 int i; 1891 int i;
1894 int syncing; 1892 int syncing, expanding, expanded;
1895 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; 1893 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
1896 int non_overwrite = 0; 1894 int non_overwrite = 0;
1897 int failed_num[2] = {0, 0}; 1895 int failed_num[2] = {0, 0};
@@ -1909,6 +1907,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
1909 clear_bit(STRIPE_DELAYED, &sh->state); 1907 clear_bit(STRIPE_DELAYED, &sh->state);
1910 1908
1911 syncing = test_bit(STRIPE_SYNCING, &sh->state); 1909 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1910 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1911 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
1912 /* Now to look around and see what can be done */ 1912 /* Now to look around and see what can be done */
1913 1913
1914 rcu_read_lock(); 1914 rcu_read_lock();
@@ -2114,13 +2114,15 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2114 * parity, or to satisfy requests 2114 * parity, or to satisfy requests
2115 * or to load a block that is being partially written. 2115 * or to load a block that is being partially written.
2116 */ 2116 */
2117 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { 2117 if (to_read || non_overwrite || (to_write && failed) ||
2118 (syncing && (uptodate < disks)) || expanding) {
2118 for (i=disks; i--;) { 2119 for (i=disks; i--;) {
2119 dev = &sh->dev[i]; 2120 dev = &sh->dev[i];
2120 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && 2121 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
2121 (dev->toread || 2122 (dev->toread ||
2122 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2123 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2123 syncing || 2124 syncing ||
2125 expanding ||
2124 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || 2126 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
2125 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) 2127 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
2126 ) 2128 )
@@ -2355,6 +2357,79 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2355 } 2357 }
2356 } 2358 }
2357 } 2359 }
2360
2361 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
2362 /* Need to write out all blocks after computing P&Q */
2363 sh->disks = conf->raid_disks;
2364 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2365 conf->raid_disks);
2366 compute_parity6(sh, RECONSTRUCT_WRITE);
2367 for (i = conf->raid_disks ; i-- ; ) {
2368 set_bit(R5_LOCKED, &sh->dev[i].flags);
2369 locked++;
2370 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2371 }
2372 clear_bit(STRIPE_EXPANDING, &sh->state);
2373 } else if (expanded) {
2374 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2375 atomic_dec(&conf->reshape_stripes);
2376 wake_up(&conf->wait_for_overlap);
2377 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2378 }
2379
2380 if (expanding && locked == 0) {
2381 /* We have read all the blocks in this stripe and now we need to
2382 * copy some of them into a target stripe for expand.
2383 */
2384 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2385 for (i = 0; i < sh->disks ; i++)
2386 if (i != pd_idx && i != qd_idx) {
2387 int dd_idx2, pd_idx2, j;
2388 struct stripe_head *sh2;
2389
2390 sector_t bn = compute_blocknr(sh, i);
2391 sector_t s = raid5_compute_sector(
2392 bn, conf->raid_disks,
2393 conf->raid_disks - conf->max_degraded,
2394 &dd_idx2, &pd_idx2, conf);
2395 sh2 = get_active_stripe(conf, s,
2396 conf->raid_disks,
2397 pd_idx2, 1);
2398 if (sh2 == NULL)
2399 /* so far only the early blocks of
2400 * this stripe have been requested.
2401 * When later blocks get requested, we
2402 * will try again
2403 */
2404 continue;
2405 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2406 test_bit(R5_Expanded,
2407 &sh2->dev[dd_idx2].flags)) {
2408 /* must have already done this block */
2409 release_stripe(sh2);
2410 continue;
2411 }
2412 memcpy(page_address(sh2->dev[dd_idx2].page),
2413 page_address(sh->dev[i].page),
2414 STRIPE_SIZE);
2415 set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
2416 set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
2417 for (j = 0 ; j < conf->raid_disks ; j++)
2418 if (j != sh2->pd_idx &&
2419 j != raid6_next_disk(sh2->pd_idx,
2420 sh2->disks) &&
2421 !test_bit(R5_Expanded,
2422 &sh2->dev[j].flags))
2423 break;
2424 if (j == conf->raid_disks) {
2425 set_bit(STRIPE_EXPAND_READY,
2426 &sh2->state);
2427 set_bit(STRIPE_HANDLE, &sh2->state);
2428 }
2429 release_stripe(sh2);
2430 }
2431 }
2432
2358 spin_unlock(&sh->lock); 2433 spin_unlock(&sh->lock);
2359 2434
2360 while ((bi=return_bi)) { 2435 while ((bi=return_bi)) {
@@ -2395,7 +2470,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2395 rcu_read_unlock(); 2470 rcu_read_unlock();
2396 2471
2397 if (rdev) { 2472 if (rdev) {
2398 if (syncing) 2473 if (syncing || expanding || expanded)
2399 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 2474 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
2400 2475
2401 bi->bi_bdev = rdev->bdev; 2476 bi->bi_bdev = rdev->bdev;
@@ -2915,8 +2990,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2915 struct stripe_head *sh; 2990 struct stripe_head *sh;
2916 int pd_idx; 2991 int pd_idx;
2917 sector_t first_sector, last_sector; 2992 sector_t first_sector, last_sector;
2918 int raid_disks; 2993 int raid_disks = conf->previous_raid_disks;
2919 int data_disks; 2994 int data_disks = raid_disks - conf->max_degraded;
2995 int new_data_disks = conf->raid_disks - conf->max_degraded;
2920 int i; 2996 int i;
2921 int dd_idx; 2997 int dd_idx;
2922 sector_t writepos, safepos, gap; 2998 sector_t writepos, safepos, gap;
@@ -2925,7 +3001,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2925 conf->expand_progress != 0) { 3001 conf->expand_progress != 0) {
2926 /* restarting in the middle, skip the initial sectors */ 3002 /* restarting in the middle, skip the initial sectors */
2927 sector_nr = conf->expand_progress; 3003 sector_nr = conf->expand_progress;
2928 sector_div(sector_nr, conf->raid_disks-1); 3004 sector_div(sector_nr, new_data_disks);
2929 *skipped = 1; 3005 *skipped = 1;
2930 return sector_nr; 3006 return sector_nr;
2931 } 3007 }
@@ -2939,14 +3015,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2939 * to after where expand_lo old_maps to 3015 * to after where expand_lo old_maps to
2940 */ 3016 */
2941 writepos = conf->expand_progress + 3017 writepos = conf->expand_progress +
2942 conf->chunk_size/512*(conf->raid_disks-1); 3018 conf->chunk_size/512*(new_data_disks);
2943 sector_div(writepos, conf->raid_disks-1); 3019 sector_div(writepos, new_data_disks);
2944 safepos = conf->expand_lo; 3020 safepos = conf->expand_lo;
2945 sector_div(safepos, conf->previous_raid_disks-1); 3021 sector_div(safepos, data_disks);
2946 gap = conf->expand_progress - conf->expand_lo; 3022 gap = conf->expand_progress - conf->expand_lo;
2947 3023
2948 if (writepos >= safepos || 3024 if (writepos >= safepos ||
2949 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { 3025 gap > (new_data_disks)*3000*2 /*3Meg*/) {
2950 /* Cannot proceed until we've updated the superblock... */ 3026 /* Cannot proceed until we've updated the superblock... */
2951 wait_event(conf->wait_for_overlap, 3027 wait_event(conf->wait_for_overlap,
2952 atomic_read(&conf->reshape_stripes)==0); 3028 atomic_read(&conf->reshape_stripes)==0);
@@ -2976,6 +3052,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2976 sector_t s; 3052 sector_t s;
2977 if (j == sh->pd_idx) 3053 if (j == sh->pd_idx)
2978 continue; 3054 continue;
3055 if (conf->level == 6 &&
3056 j == raid6_next_disk(sh->pd_idx, sh->disks))
3057 continue;
2979 s = compute_blocknr(sh, j); 3058 s = compute_blocknr(sh, j);
2980 if (s < (mddev->array_size<<1)) { 3059 if (s < (mddev->array_size<<1)) {
2981 skipped = 1; 3060 skipped = 1;
@@ -2999,21 +3078,20 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
2999 * The source stripes are determined by mapping the first and last 3078 * The source stripes are determined by mapping the first and last
3000 * block on the destination stripes. 3079 * block on the destination stripes.
3001 */ 3080 */
3002 raid_disks = conf->previous_raid_disks;
3003 data_disks = raid_disks - 1;
3004 first_sector = 3081 first_sector =
3005 raid5_compute_sector(sector_nr*(conf->raid_disks-1), 3082 raid5_compute_sector(sector_nr*(new_data_disks),
3006 raid_disks, data_disks, 3083 raid_disks, data_disks,
3007 &dd_idx, &pd_idx, conf); 3084 &dd_idx, &pd_idx, conf);
3008 last_sector = 3085 last_sector =
3009 raid5_compute_sector((sector_nr+conf->chunk_size/512) 3086 raid5_compute_sector((sector_nr+conf->chunk_size/512)
3010 *(conf->raid_disks-1) -1, 3087 *(new_data_disks) -1,
3011 raid_disks, data_disks, 3088 raid_disks, data_disks,
3012 &dd_idx, &pd_idx, conf); 3089 &dd_idx, &pd_idx, conf);
3013 if (last_sector >= (mddev->size<<1)) 3090 if (last_sector >= (mddev->size<<1))
3014 last_sector = (mddev->size<<1)-1; 3091 last_sector = (mddev->size<<1)-1;
3015 while (first_sector <= last_sector) { 3092 while (first_sector <= last_sector) {
3016 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); 3093 pd_idx = stripe_to_pdidx(first_sector, conf,
3094 conf->previous_raid_disks);
3017 sh = get_active_stripe(conf, first_sector, 3095 sh = get_active_stripe(conf, first_sector,
3018 conf->previous_raid_disks, pd_idx, 0); 3096 conf->previous_raid_disks, pd_idx, 0);
3019 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3097 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
@@ -3348,35 +3426,44 @@ static int run(mddev_t *mddev)
3348 */ 3426 */
3349 sector_t here_new, here_old; 3427 sector_t here_new, here_old;
3350 int old_disks; 3428 int old_disks;
3429 int max_degraded = (mddev->level == 5 ? 1 : 2);
3351 3430
3352 if (mddev->new_level != mddev->level || 3431 if (mddev->new_level != mddev->level ||
3353 mddev->new_layout != mddev->layout || 3432 mddev->new_layout != mddev->layout ||
3354 mddev->new_chunk != mddev->chunk_size) { 3433 mddev->new_chunk != mddev->chunk_size) {
3355 printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", 3434 printk(KERN_ERR "raid5: %s: unsupported reshape "
3435 "required - aborting.\n",
3356 mdname(mddev)); 3436 mdname(mddev));
3357 return -EINVAL; 3437 return -EINVAL;
3358 } 3438 }
3359 if (mddev->delta_disks <= 0) { 3439 if (mddev->delta_disks <= 0) {
3360 printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", 3440 printk(KERN_ERR "raid5: %s: unsupported reshape "
3441 "(reduce disks) required - aborting.\n",
3361 mdname(mddev)); 3442 mdname(mddev));
3362 return -EINVAL; 3443 return -EINVAL;
3363 } 3444 }
3364 old_disks = mddev->raid_disks - mddev->delta_disks; 3445 old_disks = mddev->raid_disks - mddev->delta_disks;
3365 /* reshape_position must be on a new-stripe boundary, and one 3446 /* reshape_position must be on a new-stripe boundary, and one
3366 * further up in new geometry must map after here in old geometry. 3447 * further up in new geometry must map after here in old
3448 * geometry.
3367 */ 3449 */
3368 here_new = mddev->reshape_position; 3450 here_new = mddev->reshape_position;
3369 if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { 3451 if (sector_div(here_new, (mddev->chunk_size>>9)*
3370 printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); 3452 (mddev->raid_disks - max_degraded))) {
3453 printk(KERN_ERR "raid5: reshape_position not "
3454 "on a stripe boundary\n");
3371 return -EINVAL; 3455 return -EINVAL;
3372 } 3456 }
3373 /* here_new is the stripe we will write to */ 3457 /* here_new is the stripe we will write to */
3374 here_old = mddev->reshape_position; 3458 here_old = mddev->reshape_position;
3375 sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); 3459 sector_div(here_old, (mddev->chunk_size>>9)*
3376 /* here_old is the first stripe that we might need to read from */ 3460 (old_disks-max_degraded));
3461 /* here_old is the first stripe that we might need to read
3462 * from */
3377 if (here_new >= here_old) { 3463 if (here_new >= here_old) {
3378 /* Reading from the same stripe as writing to - bad */ 3464 /* Reading from the same stripe as writing to - bad */
3379 printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); 3465 printk(KERN_ERR "raid5: reshape_position too early for "
3466 "auto-recovery - aborting.\n");
3380 return -EINVAL; 3467 return -EINVAL;
3381 } 3468 }
3382 printk(KERN_INFO "raid5: reshape will continue\n"); 3469 printk(KERN_INFO "raid5: reshape will continue\n");
@@ -3829,8 +3916,7 @@ static int raid5_start_reshape(mddev_t *mddev)
3829 int added_devices = 0; 3916 int added_devices = 0;
3830 unsigned long flags; 3917 unsigned long flags;
3831 3918
3832 if (mddev->degraded || 3919 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3833 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3834 return -EBUSY; 3920 return -EBUSY;
3835 3921
3836 ITERATE_RDEV(mddev, rdev, rtmp) 3922 ITERATE_RDEV(mddev, rdev, rtmp)
@@ -3838,7 +3924,7 @@ static int raid5_start_reshape(mddev_t *mddev)
3838 !test_bit(Faulty, &rdev->flags)) 3924 !test_bit(Faulty, &rdev->flags))
3839 spares++; 3925 spares++;
3840 3926
3841 if (spares < mddev->delta_disks-1) 3927 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
3842 /* Not enough devices even to make a degraded array 3928 /* Not enough devices even to make a degraded array
3843 * of that size 3929 * of that size
3844 */ 3930 */
@@ -3901,7 +3987,8 @@ static void end_reshape(raid5_conf_t *conf)
3901 struct block_device *bdev; 3987 struct block_device *bdev;
3902 3988
3903 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 3989 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
3904 conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); 3990 conf->mddev->array_size = conf->mddev->size *
3991 (conf->raid_disks - conf->max_degraded);
3905 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 3992 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
3906 conf->mddev->changed = 1; 3993 conf->mddev->changed = 1;
3907 3994
@@ -3974,6 +4061,10 @@ static struct mdk_personality raid6_personality =
3974 .spare_active = raid5_spare_active, 4061 .spare_active = raid5_spare_active,
3975 .sync_request = sync_request, 4062 .sync_request = sync_request,
3976 .resize = raid5_resize, 4063 .resize = raid5_resize,
4064#ifdef CONFIG_MD_RAID5_RESHAPE
4065 .check_reshape = raid5_check_reshape,
4066 .start_reshape = raid5_start_reshape,
4067#endif
3977 .quiesce = raid5_quiesce, 4068 .quiesce = raid5_quiesce,
3978}; 4069};
3979static struct mdk_personality raid5_personality = 4070static struct mdk_personality raid5_personality =