author     NeilBrown <neilb@suse.de>                 2006-03-27 04:18:11 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>     2006-03-27 11:45:01 -0500
commit     f67055780caac6a99f43834795c43acf99eba6a6 (patch)
tree       6b80e7b4cb300edb0910dbad1d840ff8e2f36ae5
parent     292695531ae4019bb15deedc121b218d1908b648 (diff)
[PATCH] md: Checkpoint and allow restart of raid5 reshape
We allow the superblock to record an 'old' and a 'new' geometry, and a position
where any conversion is up to.  The geometry allows for changing chunksize,
layout and level as well as number of devices.

When using a version-0.90 superblock, we convert the version to 0.91 while the
conversion is happening so that an old kernel will refuse to assemble the
array.  For version-1, we use a feature bit for the same effect.

When starting an array we check for an incomplete reshape and restart the
reshape process if needed.  If the reshape stopped at an awkward time (like
when updating the first stripe) we refuse to assemble the array, and let
user-space worry about it.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
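To make the on-disk checkpoint concrete, here is a minimal userspace-style sketch (not part of this patch) of how an assembler could recognise an interrupted reshape from the fields the patch introduces.  The struct layouts and helper names (sb_v090, sb_v1, reshape_pending_*) are simplified stand-ins for the real mdp_superblock_s / mdp_superblock_1 definitions in md_p.h and are illustrative only.

/* Illustrative sketch only -- simplified mirrors of the fields added below. */
#include <stdint.h>
#include <stdio.h>

#define MD_FEATURE_RESHAPE_ACTIVE 4     /* v1 feature bit added by this patch */

struct sb_v090 {                        /* stand-in for mdp_superblock_s */
        uint32_t minor_version;         /* bumped from 90 to 91 while reshaping */
        uint64_t reshape_position;      /* next array-space address to convert */
        uint32_t new_level, delta_disks, new_layout, new_chunk;
};

struct sb_v1 {                          /* stand-in for mdp_superblock_1 */
        uint32_t feature_map;           /* RESHAPE_ACTIVE set while reshaping */
        uint64_t reshape_position;
        uint32_t new_level, delta_disks, new_layout, new_chunk;
};

/* A v0.90 array is mid-reshape iff the version was bumped to 0.91. */
static int reshape_pending_v090(const struct sb_v090 *sb)
{
        return sb->minor_version == 91;
}

/* A v1 array is mid-reshape iff the feature bit is set. */
static int reshape_pending_v1(const struct sb_v1 *sb)
{
        return (sb->feature_map & MD_FEATURE_RESHAPE_ACTIVE) != 0;
}

int main(void)
{
        struct sb_v090 sb = { .minor_version = 91, .reshape_position = 8192,
                              .delta_disks = 1 };

        if (reshape_pending_v090(&sb))
                printf("reshape checkpointed at sector %llu, restart from there\n",
                       (unsigned long long)sb.reshape_position);
        return 0;
}

An old kernel that only understands minor_version 90 (or does not know the feature bit) refuses to assemble such an array, which is exactly the protection the version bump and feature bit provide.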
-rw-r--r--  drivers/md/md.c               69
-rw-r--r--  drivers/md/raid1.c             5
-rw-r--r--  drivers/md/raid5.c           140
-rw-r--r--  include/linux/raid/md.h        2
-rw-r--r--  include/linux/raid/md_k.h      8
-rw-r--r--  include/linux/raid/md_p.h     32
-rw-r--r--  include/linux/raid/raid5.h     1
7 files changed, 231 insertions, 26 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d169bc964676..b9dfdfccdb78 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -662,7 +662,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
         }
 
         if (sb->major_version != 0 ||
-            sb->minor_version != 90) {
+            sb->minor_version < 90 ||
+            sb->minor_version > 91) {
                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                         sb->major_version, sb->minor_version,
                         b);
@@ -747,6 +748,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                 mddev->bitmap_offset = 0;
                 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
+                if (mddev->minor_version >= 91) {
+                        mddev->reshape_position = sb->reshape_position;
+                        mddev->delta_disks = sb->delta_disks;
+                        mddev->new_level = sb->new_level;
+                        mddev->new_layout = sb->new_layout;
+                        mddev->new_chunk = sb->new_chunk;
+                } else {
+                        mddev->reshape_position = MaxSector;
+                        mddev->delta_disks = 0;
+                        mddev->new_level = mddev->level;
+                        mddev->new_layout = mddev->layout;
+                        mddev->new_chunk = mddev->chunk_size;
+                }
+
                 if (sb->state & (1<<MD_SB_CLEAN))
                         mddev->recovery_cp = MaxSector;
                 else {
@@ -841,7 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
         sb->md_magic = MD_SB_MAGIC;
         sb->major_version = mddev->major_version;
-        sb->minor_version = mddev->minor_version;
         sb->patch_version = mddev->patch_version;
         sb->gvalid_words = 0; /* ignored */
         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
@@ -860,6 +874,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
         sb->events_hi = (mddev->events>>32);
         sb->events_lo = (u32)mddev->events;
 
+        if (mddev->reshape_position == MaxSector)
+                sb->minor_version = 90;
+        else {
+                sb->minor_version = 91;
+                sb->reshape_position = mddev->reshape_position;
+                sb->new_level = mddev->new_level;
+                sb->delta_disks = mddev->delta_disks;
+                sb->new_layout = mddev->new_layout;
+                sb->new_chunk = mddev->new_chunk;
+        }
+        mddev->minor_version = sb->minor_version;
         if (mddev->in_sync)
         {
                 sb->recovery_cp = mddev->recovery_cp;
@@ -1104,6 +1129,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                         }
                         mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
                 }
+                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+                        mddev->new_level = le32_to_cpu(sb->new_level);
+                        mddev->new_layout = le32_to_cpu(sb->new_layout);
+                        mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+                } else {
+                        mddev->reshape_position = MaxSector;
+                        mddev->delta_disks = 0;
+                        mddev->new_level = mddev->level;
+                        mddev->new_layout = mddev->layout;
+                        mddev->new_chunk = mddev->chunk_size;
+                }
+
         } else if (mddev->pers == NULL) {
                 /* Insist of good event counter while assembling */
                 __u64 ev1 = le64_to_cpu(sb->events);
@@ -1175,6 +1214,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
         }
+        if (mddev->reshape_position != MaxSector) {
+                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+                sb->new_layout = cpu_to_le32(mddev->new_layout);
+                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+                sb->new_level = cpu_to_le32(mddev->new_level);
+                sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+        }
 
         max_dev = 0;
         ITERATE_RDEV(mddev,rdev2,tmp)
@@ -1497,7 +1544,7 @@ static void sync_sbs(mddev_t * mddev)
         }
 }
 
-static void md_update_sb(mddev_t * mddev)
+void md_update_sb(mddev_t * mddev)
 {
         int err;
         struct list_head *tmp;
@@ -1574,6 +1621,7 @@ repeat:
         wake_up(&mddev->sb_wait);
 
 }
+EXPORT_SYMBOL_GPL(md_update_sb);
 
 /* words written to sysfs files may, or my not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -2545,6 +2593,14 @@ static int do_md_run(mddev_t * mddev)
         mddev->level = pers->level;
         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
+        if (mddev->reshape_position != MaxSector &&
+            pers->reshape == NULL) {
+                /* This personality cannot handle reshaping... */
+                mddev->pers = NULL;
+                module_put(pers->owner);
+                return -EINVAL;
+        }
+
         mddev->recovery = 0;
         mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
         mddev->barriers_work = 1;
@@ -3433,11 +3489,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
         mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
         mddev->bitmap_offset = 0;
 
+        mddev->reshape_position = MaxSector;
+
         /*
          * Generate a 128 bit UUID
          */
         get_random_bytes(mddev->uuid, 16);
 
+        mddev->new_level = mddev->level;
+        mddev->new_chunk = mddev->chunk_size;
+        mddev->new_layout = mddev->layout;
+        mddev->delta_disks = 0;
+
         return 0;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5d88329e3c7a..b65b8cfbdf30 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1789,6 +1789,11 @@ static int run(mddev_t *mddev)
                        mdname(mddev), mddev->level);
                 goto out;
         }
+        if (mddev->reshape_position != MaxSector) {
+                printk("raid1: %s: reshape_position set but not supported\n",
+                       mdname(mddev));
+                goto out;
+        }
         /*
          * copy the already verified devices into our private RAID1
          * bookkeeping area. [whatever we allocate in run(),
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b29135acb1d9..20ae32d67e21 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -22,6 +22,7 @@
 #include <linux/raid/raid5.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
+#include <linux/kthread.h>
 #include <asm/atomic.h>
 
 #include <linux/raid/bitmap.h>
@@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh)
                         clear_bit(STRIPE_EXPANDING, &sh->state);
                 } else if (expanded) {
                         clear_bit(STRIPE_EXPAND_READY, &sh->state);
+                        atomic_dec(&conf->reshape_stripes);
                         wake_up(&conf->wait_for_overlap);
                         md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
                 }
@@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                  */
                 int i;
                 int dd_idx;
+
+                if (sector_nr == 0 &&
+                    conf->expand_progress != 0) {
+                        /* restarting in the middle, skip the initial sectors */
+                        sector_nr = conf->expand_progress;
+                        sector_div(sector_nr, conf->raid_disks-1);
+                        *skipped = 1;
+                        return sector_nr;
+                }
+
+                /* Cannot proceed until we've updated the superblock... */
+                wait_event(conf->wait_for_overlap,
+                           atomic_read(&conf->reshape_stripes)==0);
+                mddev->reshape_position = conf->expand_progress;
+
+                mddev->sb_dirty = 1;
+                md_wakeup_thread(mddev->thread);
+                wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+                           kthread_should_stop());
+
                 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
                         int j;
                         int skipped = 0;
@@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                         sh = get_active_stripe(conf, sector_nr+i,
                                                conf->raid_disks, pd_idx, 0);
                         set_bit(STRIPE_EXPANDING, &sh->state);
+                        atomic_inc(&conf->reshape_stripes);
                         /* If any of this stripe is beyond the end of the old
                          * array, then we need to zero those blocks
                          */
@@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev)
                 return -EIO;
         }
 
+        if (mddev->reshape_position != MaxSector) {
+                /* Check that we can continue the reshape.
+                 * Currently only disks can change, it must
+                 * increase, and we must be past the point where
+                 * a stripe over-writes itself
+                 */
+                sector_t here_new, here_old;
+                int old_disks;
+
+                if (mddev->new_level != mddev->level ||
+                    mddev->new_layout != mddev->layout ||
+                    mddev->new_chunk != mddev->chunk_size) {
+                        printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+                               mdname(mddev));
+                        return -EINVAL;
+                }
+                if (mddev->delta_disks <= 0) {
+                        printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+                               mdname(mddev));
+                        return -EINVAL;
+                }
+                old_disks = mddev->raid_disks - mddev->delta_disks;
+                /* reshape_position must be on a new-stripe boundary, and one
+                 * further up in new geometry must map after here in old geometry.
+                 */
+                here_new = mddev->reshape_position;
+                if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
+                        printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+                        return -EINVAL;
+                }
+                /* here_new is the stripe we will write to */
+                here_old = mddev->reshape_position;
+                sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
+                /* here_old is the first stripe that we might need to read from */
+                if (here_new >= here_old) {
+                        /* Reading from the same stripe as writing to - bad */
+                        printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+                        return -EINVAL;
+                }
+                printk(KERN_INFO "raid5: reshape will continue\n");
+                /* OK, we should be able to continue; */
+        }
+
+
         mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
         if ((conf = mddev->private) == NULL)
                 goto abort;
-        conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
+        if (mddev->reshape_position == MaxSector) {
+                conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
+        } else {
+                conf->raid_disks = mddev->raid_disks;
+                conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
+        }
+
+        conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
                               GFP_KERNEL);
         if (!conf->disks)
                 goto abort;
@@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev)
 
         ITERATE_RDEV(mddev,rdev,tmp) {
                 raid_disk = rdev->raid_disk;
-                if (raid_disk >= mddev->raid_disks
+                if (raid_disk >= conf->raid_disks
                     || raid_disk < 0)
                         continue;
                 disk = conf->disks + raid_disk;
@@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev)
                 }
         }
 
-        conf->raid_disks = mddev->raid_disks;
         /*
          * 0 for a fully functional array, 1 for a degraded array.
          */
@@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev)
         conf->level = mddev->level;
         conf->algorithm = mddev->layout;
         conf->max_nr_stripes = NR_STRIPES;
-        conf->expand_progress = MaxSector;
+        conf->expand_progress = mddev->reshape_position;
 
         /* device size must be a multiple of chunk size */
         mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev)
 
         print_raid5_conf(conf);
 
+        if (conf->expand_progress != MaxSector) {
+                printk("...ok start reshape thread\n");
+                atomic_set(&conf->reshape_stripes, 0);
+                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+                clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+                set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+                mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+                                                        "%s_reshape");
+                /* FIXME if md_register_thread fails?? */
+                md_wakeup_thread(mddev->sync_thread);
+
+        }
+
         /* read-ahead size must cover two whole stripes, which is
          * 2 * (n-1) * chunksize where 'n' is the number of raid devices
          */
@@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev)
 
         mddev->queue->unplug_fn = raid5_unplug_device;
         mddev->queue->issue_flush_fn = raid5_issue_flush;
+        mddev->array_size = mddev->size * (conf->previous_raid_disks - 1);
 
-        mddev->array_size = mddev->size * (mddev->raid_disks - 1);
         return 0;
 abort:
         if (conf) {
@@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
         /*
          * find the disk ...
          */
-        for (disk=0; disk < mddev->raid_disks; disk++)
+        for (disk=0; disk < conf->raid_disks; disk++)
                 if ((p=conf->disks + disk)->rdev == NULL) {
                         clear_bit(In_sync, &rdev->flags);
                         rdev->raid_disk = disk;
@@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
         if (err)
                 return err;
 
+        atomic_set(&conf->reshape_stripes, 0);
         spin_lock_irq(&conf->device_lock);
         conf->previous_raid_disks = conf->raid_disks;
-        mddev->raid_disks = conf->raid_disks = raid_disks;
+        conf->raid_disks = raid_disks;
         conf->expand_progress = 0;
         spin_unlock_irq(&conf->device_lock);
 
@@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
                 }
 
         mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
+        mddev->new_chunk = mddev->chunk_size;
+        mddev->new_layout = mddev->layout;
+        mddev->new_level = mddev->level;
+        mddev->raid_disks = raid_disks;
+        mddev->delta_disks = raid_disks - conf->previous_raid_disks;
+        mddev->reshape_position = 0;
+        mddev->sb_dirty = 1;
+
         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
                 mddev->recovery = 0;
                 spin_lock_irq(&conf->device_lock);
                 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+                mddev->delta_disks = 0;
                 conf->expand_progress = MaxSector;
                 spin_unlock_irq(&conf->device_lock);
                 return -EAGAIN;
@@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf)
 {
         struct block_device *bdev;
 
-        conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1);
-        set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
-        conf->mddev->changed = 1;
-
-        bdev = bdget_disk(conf->mddev->gendisk, 0);
-        if (bdev) {
-                mutex_lock(&bdev->bd_inode->i_mutex);
-                i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
-                mutex_unlock(&bdev->bd_inode->i_mutex);
-                bdput(bdev);
+        if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+                conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+                set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+                conf->mddev->changed = 1;
+
+                bdev = bdget_disk(conf->mddev->gendisk, 0);
+                if (bdev) {
+                        mutex_lock(&bdev->bd_inode->i_mutex);
+                        i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+                        mutex_unlock(&bdev->bd_inode->i_mutex);
+                        bdput(bdev);
+                }
+                spin_lock_irq(&conf->device_lock);
+                conf->expand_progress = MaxSector;
+                spin_unlock_irq(&conf->device_lock);
+                conf->mddev->reshape_position = MaxSector;
         }
-        spin_lock_irq(&conf->device_lock);
-        conf->expand_progress = MaxSector;
-        spin_unlock_irq(&conf->device_lock);
 }
 
 static void raid5_quiesce(mddev_t *mddev, int state)
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 9c77cde5a795..66b44e5e0d6e 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,6 +95,8 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
 
+extern void md_update_sb(mddev_t * mddev);
+
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
 #endif
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 4e26ef2cacca..1a6f9f2f6282 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -132,6 +132,14 @@ struct mddev_s
 
         char uuid[16];
 
+        /* If the array is being reshaped, we need to record the
+         * new shape and an indication of where we are up to.
+         * This is written to the superblock.
+         * If reshape_position is MaxSector, then no reshape is happening (yet).
+         */
+        sector_t                reshape_position;
+        int                     delta_disks, new_level, new_layout, new_chunk;
+
         struct mdk_thread_s     *thread;        /* management thread */
         struct mdk_thread_s     *sync_thread;   /* doing resync or reconstruct */
         sector_t                curr_resync;    /* blocks scheduled */
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index c100fa5d4bfa..774e1acfb8c4 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -102,6 +102,18 @@ typedef struct mdp_device_descriptor_s {
 #define MD_SB_ERRORS           1
 
 #define MD_SB_BITMAP_PRESENT   8 /* bitmap may be present nearby */
+
+/*
+ * Notes:
+ * - if an array is being reshaped (restriped) in order to change the
+ *   the number of active devices in the array, 'raid_disks' will be
+ *   the larger of the old and new numbers.  'delta_disks' will
+ *   be the "new - old".  So if +ve, raid_disks is the new value, and
+ *   "raid_disks-delta_disks" is the old.  If -ve, raid_disks is the
+ *   old value and "raid_disks+delta_disks" is the new (smaller) value.
+ */
+
+
 typedef struct mdp_superblock_s {
         /*
          * Constant generic information
@@ -146,7 +158,13 @@ typedef struct mdp_superblock_s {
         __u32 cp_events_hi;     /* 10 high-order of checkpoint update count */
 #endif
         __u32 recovery_cp;      /* 11 recovery checkpoint sector count */
-        __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12];
+        /* There are only valid for minor_version > 90 */
+        __u64 reshape_position; /* 12,13 next address in array-space for reshape */
+        __u32 new_level;        /* 14 new level we are reshaping to */
+        __u32 delta_disks;      /* 15 change in number of raid_disks */
+        __u32 new_layout;       /* 16 new layout */
+        __u32 new_chunk;        /* 17 new chunk size (bytes) */
+        __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18];
 
         /*
          * Personality information
@@ -207,7 +225,14 @@ struct mdp_superblock_1 {
          * NOTE: signed, so bitmap can be before superblock
          * only meaningful of feature_map[0] is set.
          */
-        __u8    pad1[128-100];  /* set to 0 when written */
+
+        /* These are only valid with feature bit '4' */
+        __u64   reshape_position;       /* next address in array-space for reshape */
+        __u32   new_level;      /* new level we are reshaping to */
+        __u32   delta_disks;    /* change in number of raid_disks */
+        __u32   new_layout;     /* new layout */
+        __u32   new_chunk;      /* new chunk size (bytes) */
+        __u8    pad1[128-124];  /* set to 0 when written */
 
         /* constant this-device information - 64 bytes */
         __u64   data_offset;    /* sector start of data, often 0 */
@@ -240,8 +265,9 @@ struct mdp_superblock_1 {
 
 /* feature_map bits */
 #define MD_FEATURE_BITMAP_OFFSET        1
+#define MD_FEATURE_RESHAPE_ACTIVE       4
 
-#define MD_FEATURE_ALL                  1
+#define MD_FEATURE_ALL                  5
 
 #endif
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 55c738d50508..abcdf0d0658a 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -224,6 +224,7 @@ struct raid5_private_data {
         struct list_head        bitmap_list; /* stripes delaying awaiting bitmap update */
         atomic_t                preread_active_stripes; /* stripes with scheduled io */
 
+        atomic_t                reshape_stripes; /* stripes with pending writes for reshape */
         /* unfortunately we need two cache names as we temporarily have
          * two caches.
          */