aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2006-03-27 04:18:10 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-27 11:45:01 -0500
commit292695531ae4019bb15deedc121b218d1908b648 (patch)
treefb205eae13c3f3410f6ea44557b1c96b075a4d44
parentccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e (diff)
[PATCH] md: Final stages of raid5 expand code
This patch adds raid5_reshape and end_reshape which will start and finish the reshape processes. raid5_reshape is only enabled if CONFIG_MD_RAID5_RESHAPE is set, to discourage accidental use. Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry, and make sure that you have backups, just in case. Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/md/Kconfig26
-rw-r--r--drivers/md/md.c6
-rw-r--r--drivers/md/raid5.c123
-rw-r--r--include/linux/raid/md.h3
4 files changed, 154 insertions, 4 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac43f98062fd..fd2aae150ccc 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -127,6 +127,32 @@ config MD_RAID5
127 127
128 If unsure, say Y. 128 If unsure, say Y.
129 129
130config MD_RAID5_RESHAPE
131 bool "Support adding drives to a raid-5 array (experimental)"
132 depends on MD_RAID5 && EXPERIMENTAL
133 ---help---
134 A RAID-5 set can be expanded by adding extra drives. This
135 requires "restriping" the array which means (almost) every
136 block must be written to a different place.
137
138 This option allows such restriping to be done while the array
139 is online. However it is still EXPERIMENTAL code. It should
140 work, but please be sure that you have backups.
141
142 You will need a version of mdadm newer than 2.3.1. During the
143 early stage of reshape there is a critical section where live data
144 is being over-written. A crash during this time needs extra care
145 for recovery. The newer mdadm takes a copy of the data in the
146 critical section and will restore it, if necessary, after a crash.
147
148 The mdadm usage is e.g.
149 mdadm --grow /dev/md1 --raid-disks=6
150 to grow '/dev/md1' to 6 disks.
151
152 Note: The array can only be expanded, not contracted.
153 There should be enough spares already present to make the new
154 array workable.
155
130config MD_RAID6 156config MD_RAID6
131 tristate "RAID-6 mode" 157 tristate "RAID-6 mode"
132 depends on BLK_DEV_MD 158 depends on BLK_DEV_MD
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e65986bc63f..d169bc964676 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -158,11 +158,12 @@ static int start_readonly;
158 */ 158 */
159static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 159static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
160static atomic_t md_event_count; 160static atomic_t md_event_count;
161static void md_new_event(mddev_t *mddev) 161void md_new_event(mddev_t *mddev)
162{ 162{
163 atomic_inc(&md_event_count); 163 atomic_inc(&md_event_count);
164 wake_up(&md_event_waiters); 164 wake_up(&md_event_waiters);
165} 165}
166EXPORT_SYMBOL_GPL(md_new_event);
166 167
167/* 168/*
168 * Enables to iterate over all existing md arrays 169 * Enables to iterate over all existing md arrays
@@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
4467 4468
4468#define SYNC_MARKS 10 4469#define SYNC_MARKS 10
4469#define SYNC_MARK_STEP (3*HZ) 4470#define SYNC_MARK_STEP (3*HZ)
4470static void md_do_sync(mddev_t *mddev) 4471void md_do_sync(mddev_t *mddev)
4471{ 4472{
4472 mddev_t *mddev2; 4473 mddev_t *mddev2;
4473 unsigned int currspeed = 0, 4474 unsigned int currspeed = 0,
@@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev)
4704 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 4705 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
4705 md_wakeup_thread(mddev->thread); 4706 md_wakeup_thread(mddev->thread);
4706} 4707}
4708EXPORT_SYMBOL_GPL(md_do_sync);
4707 4709
4708 4710
4709/* 4711/*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 56cba8d3e398..b29135acb1d9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num)
331 } 331 }
332 return 0; 332 return 0;
333} 333}
334
335#ifdef CONFIG_MD_RAID5_RESHAPE
334static int resize_stripes(raid5_conf_t *conf, int newsize) 336static int resize_stripes(raid5_conf_t *conf, int newsize)
335{ 337{
336 /* Make all the stripes able to hold 'newsize' devices. 338 /* Make all the stripes able to hold 'newsize' devices.
@@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
451 conf->pool_size = newsize; 453 conf->pool_size = newsize;
452 return err; 454 return err;
453} 455}
454 456#endif
455 457
456static int drop_one_stripe(raid5_conf_t *conf) 458static int drop_one_stripe(raid5_conf_t *conf)
457{ 459{
@@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1034 return 0; 1036 return 0;
1035} 1037}
1036 1038
1039static void end_reshape(raid5_conf_t *conf);
1040
1037static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1041static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1038{ 1042{
1039 int sectors_per_chunk = conf->chunk_size >> 9; 1043 int sectors_per_chunk = conf->chunk_size >> 9;
@@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1844 if (sector_nr >= max_sector) { 1848 if (sector_nr >= max_sector) {
1845 /* just being told to finish up .. nothing much to do */ 1849 /* just being told to finish up .. nothing much to do */
1846 unplug_slaves(mddev); 1850 unplug_slaves(mddev);
1851 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
1852 end_reshape(conf);
1853 return 0;
1854 }
1847 1855
1848 if (mddev->curr_resync < max_sector) /* aborted */ 1856 if (mddev->curr_resync < max_sector) /* aborted */
1849 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 1857 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
2464 return 0; 2472 return 0;
2465} 2473}
2466 2474
2475#ifdef CONFIG_MD_RAID5_RESHAPE
2476static int raid5_reshape(mddev_t *mddev, int raid_disks)
2477{
2478 raid5_conf_t *conf = mddev_to_conf(mddev);
2479 int err;
2480 mdk_rdev_t *rdev;
2481 struct list_head *rtmp;
2482 int spares = 0;
2483 int added_devices = 0;
2484
2485 if (mddev->degraded ||
2486 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2487 return -EBUSY;
2488 if (conf->raid_disks > raid_disks)
2489 return -EINVAL; /* Cannot shrink array yet */
2490 if (conf->raid_disks == raid_disks)
2491 return 0; /* nothing to do */
2492
2493 /* Can only proceed if there are plenty of stripe_heads.
2494 * We need a minimum of one full stripe, and for sensible progress
2495 * it is best to have about 4 times that.
2496 * If we require 4 times, then the default 256 4K stripe_heads will
2497 * allow for chunk sizes up to 256K, which is probably OK.
2498 * If the chunk size is greater, user-space should request more
2499 * stripe_heads first.
2500 */
2501 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
2502 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
2503 (mddev->chunk_size / STRIPE_SIZE)*4);
2504 return -ENOSPC;
2505 }
2506
2507 ITERATE_RDEV(mddev, rdev, rtmp)
2508 if (rdev->raid_disk < 0 &&
2509 !test_bit(Faulty, &rdev->flags))
2510 spares++;
2511 if (conf->raid_disks + spares < raid_disks-1)
2512 /* Not enough devices even to make a degraded array
2513 * of that size
2514 */
2515 return -EINVAL;
2516
2517 err = resize_stripes(conf, raid_disks);
2518 if (err)
2519 return err;
2520
2521 spin_lock_irq(&conf->device_lock);
2522 conf->previous_raid_disks = conf->raid_disks;
2523 mddev->raid_disks = conf->raid_disks = raid_disks;
2524 conf->expand_progress = 0;
2525 spin_unlock_irq(&conf->device_lock);
2526
2527 /* Add some new drives, as many as will fit.
2528 * We know there are enough to make the newly sized array work.
2529 */
2530 ITERATE_RDEV(mddev, rdev, rtmp)
2531 if (rdev->raid_disk < 0 &&
2532 !test_bit(Faulty, &rdev->flags)) {
2533 if (raid5_add_disk(mddev, rdev)) {
2534 char nm[20];
2535 set_bit(In_sync, &rdev->flags);
2536 conf->working_disks++;
2537 added_devices++;
2538 sprintf(nm, "rd%d", rdev->raid_disk);
2539 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
2540 } else
2541 break;
2542 }
2543
2544 mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
2545 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2546 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2547 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
2548 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
2549 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
2550 "%s_reshape");
2551 if (!mddev->sync_thread) {
2552 mddev->recovery = 0;
2553 spin_lock_irq(&conf->device_lock);
2554 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
2555 conf->expand_progress = MaxSector;
2556 spin_unlock_irq(&conf->device_lock);
2557 return -EAGAIN;
2558 }
2559 md_wakeup_thread(mddev->sync_thread);
2560 md_new_event(mddev);
2561 return 0;
2562}
2563#endif
2564
2565static void end_reshape(raid5_conf_t *conf)
2566{
2567 struct block_device *bdev;
2568
2569 conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1);
2570 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
2571 conf->mddev->changed = 1;
2572
2573 bdev = bdget_disk(conf->mddev->gendisk, 0);
2574 if (bdev) {
2575 mutex_lock(&bdev->bd_inode->i_mutex);
2576 i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
2577 mutex_unlock(&bdev->bd_inode->i_mutex);
2578 bdput(bdev);
2579 }
2580 spin_lock_irq(&conf->device_lock);
2581 conf->expand_progress = MaxSector;
2582 spin_unlock_irq(&conf->device_lock);
2583}
2584
2467static void raid5_quiesce(mddev_t *mddev, int state) 2585static void raid5_quiesce(mddev_t *mddev, int state)
2468{ 2586{
2469 raid5_conf_t *conf = mddev_to_conf(mddev); 2587 raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality =
2502 .spare_active = raid5_spare_active, 2620 .spare_active = raid5_spare_active,
2503 .sync_request = sync_request, 2621 .sync_request = sync_request,
2504 .resize = raid5_resize, 2622 .resize = raid5_resize,
2623#ifdef CONFIG_MD_RAID5_RESHAPE
2624 .reshape = raid5_reshape,
2625#endif
2505 .quiesce = raid5_quiesce, 2626 .quiesce = raid5_quiesce,
2506}; 2627};
2507 2628
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index b6e0bcad84e1..9c77cde5a795 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
92extern void md_super_wait(mddev_t *mddev); 92extern void md_super_wait(mddev_t *mddev);
93extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, 93extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
94 struct page *page, int rw); 94 struct page *page, int rw);
95 95extern void md_do_sync(mddev_t *mddev);
96extern void md_new_event(mddev_t *mddev);
96 97
97#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 98#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
98 99