aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2006-03-27 04:18:11 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-27 11:45:01 -0500
commitf67055780caac6a99f43834795c43acf99eba6a6 (patch)
tree6b80e7b4cb300edb0910dbad1d840ff8e2f36ae5 /drivers/md/raid5.c
parent292695531ae4019bb15deedc121b218d1908b648 (diff)
[PATCH] md: Checkpoint and allow restart of raid5 reshape
We allow the superblock to record an 'old' and a 'new' geometry, and a position where any conversion is up to. The geometry allows for changing chunksize, layout and level as well as number of devices. When using a version-0.90 superblock, we convert the version to 0.91 while the conversion is happening so that an old kernel will refuse to assemble the array. For version-1, we use a feature bit for the same effect. When starting an array we check for an incomplete reshape and restart the reshape process if needed. If the reshape stopped at an awkward time (like when updating the first stripe) we refuse to assemble the array, and let user-space worry about it. Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c140
1 files changed, 120 insertions, 20 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b29135acb1d9..20ae32d67e21 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -22,6 +22,7 @@
22#include <linux/raid/raid5.h> 22#include <linux/raid/raid5.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25#include <linux/kthread.h>
25#include <asm/atomic.h> 26#include <asm/atomic.h>
26 27
27#include <linux/raid/bitmap.h> 28#include <linux/raid/bitmap.h>
@@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh)
1504 clear_bit(STRIPE_EXPANDING, &sh->state); 1505 clear_bit(STRIPE_EXPANDING, &sh->state);
1505 } else if (expanded) { 1506 } else if (expanded) {
1506 clear_bit(STRIPE_EXPAND_READY, &sh->state); 1507 clear_bit(STRIPE_EXPAND_READY, &sh->state);
1508 atomic_dec(&conf->reshape_stripes);
1507 wake_up(&conf->wait_for_overlap); 1509 wake_up(&conf->wait_for_overlap);
1508 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 1510 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
1509 } 1511 }
@@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1875 */ 1877 */
1876 int i; 1878 int i;
1877 int dd_idx; 1879 int dd_idx;
1880
1881 if (sector_nr == 0 &&
1882 conf->expand_progress != 0) {
1883 /* restarting in the middle, skip the initial sectors */
1884 sector_nr = conf->expand_progress;
1885 sector_div(sector_nr, conf->raid_disks-1);
1886 *skipped = 1;
1887 return sector_nr;
1888 }
1889
1890 /* Cannot proceed until we've updated the superblock... */
1891 wait_event(conf->wait_for_overlap,
1892 atomic_read(&conf->reshape_stripes)==0);
1893 mddev->reshape_position = conf->expand_progress;
1894
1895 mddev->sb_dirty = 1;
1896 md_wakeup_thread(mddev->thread);
1897 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
1898 kthread_should_stop());
1899
1878 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { 1900 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
1879 int j; 1901 int j;
1880 int skipped = 0; 1902 int skipped = 0;
@@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1882 sh = get_active_stripe(conf, sector_nr+i, 1904 sh = get_active_stripe(conf, sector_nr+i,
1883 conf->raid_disks, pd_idx, 0); 1905 conf->raid_disks, pd_idx, 0);
1884 set_bit(STRIPE_EXPANDING, &sh->state); 1906 set_bit(STRIPE_EXPANDING, &sh->state);
1907 atomic_inc(&conf->reshape_stripes);
1885 /* If any of this stripe is beyond the end of the old 1908 /* If any of this stripe is beyond the end of the old
1886 * array, then we need to zero those blocks 1909 * array, then we need to zero those blocks
1887 */ 1910 */
@@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev)
2121 return -EIO; 2144 return -EIO;
2122 } 2145 }
2123 2146
2147 if (mddev->reshape_position != MaxSector) {
2148 /* Check that we can continue the reshape.
2149 * Currently only disks can change, it must
2150 * increase, and we must be past the point where
2151 * a stripe over-writes itself
2152 */
2153 sector_t here_new, here_old;
2154 int old_disks;
2155
2156 if (mddev->new_level != mddev->level ||
2157 mddev->new_layout != mddev->layout ||
2158 mddev->new_chunk != mddev->chunk_size) {
2159 printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
2160 mdname(mddev));
2161 return -EINVAL;
2162 }
2163 if (mddev->delta_disks <= 0) {
2164 printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
2165 mdname(mddev));
2166 return -EINVAL;
2167 }
2168 old_disks = mddev->raid_disks - mddev->delta_disks;
2169 /* reshape_position must be on a new-stripe boundary, and one
2170 * further up in new geometry must map after here in old geometry.
2171 */
2172 here_new = mddev->reshape_position;
2173 if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
2174 printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
2175 return -EINVAL;
2176 }
2177 /* here_new is the stripe we will write to */
2178 here_old = mddev->reshape_position;
2179 sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
2180 /* here_old is the first stripe that we might need to read from */
2181 if (here_new >= here_old) {
2182 /* Reading from the same stripe as writing to - bad */
2183 printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
2184 return -EINVAL;
2185 }
2186 printk(KERN_INFO "raid5: reshape will continue\n");
2187 /* OK, we should be able to continue; */
2188 }
2189
2190
2124 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); 2191 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
2125 if ((conf = mddev->private) == NULL) 2192 if ((conf = mddev->private) == NULL)
2126 goto abort; 2193 goto abort;
2127 conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), 2194 if (mddev->reshape_position == MaxSector) {
2195 conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
2196 } else {
2197 conf->raid_disks = mddev->raid_disks;
2198 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
2199 }
2200
2201 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
2128 GFP_KERNEL); 2202 GFP_KERNEL);
2129 if (!conf->disks) 2203 if (!conf->disks)
2130 goto abort; 2204 goto abort;
@@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev)
2148 2222
2149 ITERATE_RDEV(mddev,rdev,tmp) { 2223 ITERATE_RDEV(mddev,rdev,tmp) {
2150 raid_disk = rdev->raid_disk; 2224 raid_disk = rdev->raid_disk;
2151 if (raid_disk >= mddev->raid_disks 2225 if (raid_disk >= conf->raid_disks
2152 || raid_disk < 0) 2226 || raid_disk < 0)
2153 continue; 2227 continue;
2154 disk = conf->disks + raid_disk; 2228 disk = conf->disks + raid_disk;
@@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev)
2164 } 2238 }
2165 } 2239 }
2166 2240
2167 conf->raid_disks = mddev->raid_disks;
2168 /* 2241 /*
2169 * 0 for a fully functional array, 1 for a degraded array. 2242 * 0 for a fully functional array, 1 for a degraded array.
2170 */ 2243 */
@@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev)
2174 conf->level = mddev->level; 2247 conf->level = mddev->level;
2175 conf->algorithm = mddev->layout; 2248 conf->algorithm = mddev->layout;
2176 conf->max_nr_stripes = NR_STRIPES; 2249 conf->max_nr_stripes = NR_STRIPES;
2177 conf->expand_progress = MaxSector; 2250 conf->expand_progress = mddev->reshape_position;
2178 2251
2179 /* device size must be a multiple of chunk size */ 2252 /* device size must be a multiple of chunk size */
2180 mddev->size &= ~(mddev->chunk_size/1024 -1); 2253 mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev)
2247 2320
2248 print_raid5_conf(conf); 2321 print_raid5_conf(conf);
2249 2322
2323 if (conf->expand_progress != MaxSector) {
2324 printk("...ok start reshape thread\n");
2325 atomic_set(&conf->reshape_stripes, 0);
2326 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2327 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2328 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
2329 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
2330 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
2331 "%s_reshape");
2332 /* FIXME if md_register_thread fails?? */
2333 md_wakeup_thread(mddev->sync_thread);
2334
2335 }
2336
2250 /* read-ahead size must cover two whole stripes, which is 2337 /* read-ahead size must cover two whole stripes, which is
2251 * 2 * (n-1) * chunksize where 'n' is the number of raid devices 2338 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
2252 */ 2339 */
@@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev)
2262 2349
2263 mddev->queue->unplug_fn = raid5_unplug_device; 2350 mddev->queue->unplug_fn = raid5_unplug_device;
2264 mddev->queue->issue_flush_fn = raid5_issue_flush; 2351 mddev->queue->issue_flush_fn = raid5_issue_flush;
2352 mddev->array_size = mddev->size * (conf->previous_raid_disks - 1);
2265 2353
2266 mddev->array_size = mddev->size * (mddev->raid_disks - 1);
2267 return 0; 2354 return 0;
2268abort: 2355abort:
2269 if (conf) { 2356 if (conf) {
@@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2436 /* 2523 /*
2437 * find the disk ... 2524 * find the disk ...
2438 */ 2525 */
2439 for (disk=0; disk < mddev->raid_disks; disk++) 2526 for (disk=0; disk < conf->raid_disks; disk++)
2440 if ((p=conf->disks + disk)->rdev == NULL) { 2527 if ((p=conf->disks + disk)->rdev == NULL) {
2441 clear_bit(In_sync, &rdev->flags); 2528 clear_bit(In_sync, &rdev->flags);
2442 rdev->raid_disk = disk; 2529 rdev->raid_disk = disk;
@@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
2518 if (err) 2605 if (err)
2519 return err; 2606 return err;
2520 2607
2608 atomic_set(&conf->reshape_stripes, 0);
2521 spin_lock_irq(&conf->device_lock); 2609 spin_lock_irq(&conf->device_lock);
2522 conf->previous_raid_disks = conf->raid_disks; 2610 conf->previous_raid_disks = conf->raid_disks;
2523 mddev->raid_disks = conf->raid_disks = raid_disks; 2611 conf->raid_disks = raid_disks;
2524 conf->expand_progress = 0; 2612 conf->expand_progress = 0;
2525 spin_unlock_irq(&conf->device_lock); 2613 spin_unlock_irq(&conf->device_lock);
2526 2614
@@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
2542 } 2630 }
2543 2631
2544 mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; 2632 mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
2633 mddev->new_chunk = mddev->chunk_size;
2634 mddev->new_layout = mddev->layout;
2635 mddev->new_level = mddev->level;
2636 mddev->raid_disks = raid_disks;
2637 mddev->delta_disks = raid_disks - conf->previous_raid_disks;
2638 mddev->reshape_position = 0;
2639 mddev->sb_dirty = 1;
2640
2545 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2641 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2546 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2642 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2547 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 2643 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
2552 mddev->recovery = 0; 2648 mddev->recovery = 0;
2553 spin_lock_irq(&conf->device_lock); 2649 spin_lock_irq(&conf->device_lock);
2554 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 2650 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
2651 mddev->delta_disks = 0;
2555 conf->expand_progress = MaxSector; 2652 conf->expand_progress = MaxSector;
2556 spin_unlock_irq(&conf->device_lock); 2653 spin_unlock_irq(&conf->device_lock);
2557 return -EAGAIN; 2654 return -EAGAIN;
@@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf)
2566{ 2663{
2567 struct block_device *bdev; 2664 struct block_device *bdev;
2568 2665
2569 conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); 2666 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
2570 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 2667 conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
2571 conf->mddev->changed = 1; 2668 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
2572 2669 conf->mddev->changed = 1;
2573 bdev = bdget_disk(conf->mddev->gendisk, 0); 2670
2574 if (bdev) { 2671 bdev = bdget_disk(conf->mddev->gendisk, 0);
2575 mutex_lock(&bdev->bd_inode->i_mutex); 2672 if (bdev) {
2576 i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); 2673 mutex_lock(&bdev->bd_inode->i_mutex);
2577 mutex_unlock(&bdev->bd_inode->i_mutex); 2674 i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
2578 bdput(bdev); 2675 mutex_unlock(&bdev->bd_inode->i_mutex);
2676 bdput(bdev);
2677 }
2678 spin_lock_irq(&conf->device_lock);
2679 conf->expand_progress = MaxSector;
2680 spin_unlock_irq(&conf->device_lock);
2681 conf->mddev->reshape_position = MaxSector;
2579 } 2682 }
2580 spin_lock_irq(&conf->device_lock);
2581 conf->expand_progress = MaxSector;
2582 spin_unlock_irq(&conf->device_lock);
2583} 2683}
2584 2684
2585static void raid5_quiesce(mddev_t *mddev, int state) 2685static void raid5_quiesce(mddev_t *mddev, int state)