diff options
author | NeilBrown <neilb@suse.de> | 2006-03-27 04:18:10 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-03-27 11:45:01 -0500 |
commit | 292695531ae4019bb15deedc121b218d1908b648 (patch) | |
tree | fb205eae13c3f3410f6ea44557b1c96b075a4d44 | |
parent | ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e (diff) |
[PATCH] md: Final stages of raid5 expand code
This patch adds raid5_reshape and end_reshape which will start and finish the
reshape processes.
raid5_reshape is only enabled if CONFIG_MD_RAID5_RESHAPE is set, to discourage
accidental use.
Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry,
and make sure that you have backups, just in case.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | drivers/md/Kconfig | 26 | ||||
-rw-r--r-- | drivers/md/md.c | 6 | ||||
-rw-r--r-- | drivers/md/raid5.c | 123 | ||||
-rw-r--r-- | include/linux/raid/md.h | 3 |
4 files changed, 154 insertions, 4 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ac43f98062fd..fd2aae150ccc 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -127,6 +127,32 @@ config MD_RAID5 | |||
127 | 127 | ||
128 | If unsure, say Y. | 128 | If unsure, say Y. |
129 | 129 | ||
130 | config MD_RAID5_RESHAPE | ||
131 | bool "Support adding drives to a raid-5 array (experimental)" | ||
132 | depends on MD_RAID5 && EXPERIMENTAL | ||
133 | ---help--- | ||
134 | A RAID-5 set can be expanded by adding extra drives. This | ||
135 | requires "restriping" the array which means (almost) every | ||
136 | block must be written to a different place. | ||
137 | |||
138 | This option allows such restriping to be done while the array | ||
139 | is online. However it is still EXPERIMENTAL code. It should | ||
140 | work, but please be sure that you have backups. | ||
141 | |||
142 | You will need a version of mdadm newer than 2.3.1. During the | ||
143 | early stage of reshape there is a critical section where live data | ||
144 | is being over-written. A crash during this time needs extra care | ||
145 | for recovery. The newer mdadm takes a copy of the data in the | ||
146 | critical section and will restore it, if necessary, after a crash. | ||
147 | |||
148 | The mdadm usage is e.g. | ||
149 | mdadm --grow /dev/md1 --raid-disks=6 | ||
150 | to grow '/dev/md1' to having 6 disks. | ||
151 | |||
152 | Note: The array can only be expanded, not contracted. | ||
153 | There should be enough spares already present to make the new | ||
154 | array workable. | ||
155 | |||
130 | config MD_RAID6 | 156 | config MD_RAID6 |
131 | tristate "RAID-6 mode" | 157 | tristate "RAID-6 mode" |
132 | depends on BLK_DEV_MD | 158 | depends on BLK_DEV_MD |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e65986bc63f..d169bc964676 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -158,11 +158,12 @@ static int start_readonly; | |||
158 | */ | 158 | */ |
159 | static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); | 159 | static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); |
160 | static atomic_t md_event_count; | 160 | static atomic_t md_event_count; |
161 | static void md_new_event(mddev_t *mddev) | 161 | void md_new_event(mddev_t *mddev) |
162 | { | 162 | { |
163 | atomic_inc(&md_event_count); | 163 | atomic_inc(&md_event_count); |
164 | wake_up(&md_event_waiters); | 164 | wake_up(&md_event_waiters); |
165 | } | 165 | } |
166 | EXPORT_SYMBOL_GPL(md_new_event); | ||
166 | 167 | ||
167 | /* | 168 | /* |
168 | * Enables to iterate over all existing md arrays | 169 | * Enables to iterate over all existing md arrays |
@@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | |||
4467 | 4468 | ||
4468 | #define SYNC_MARKS 10 | 4469 | #define SYNC_MARKS 10 |
4469 | #define SYNC_MARK_STEP (3*HZ) | 4470 | #define SYNC_MARK_STEP (3*HZ) |
4470 | static void md_do_sync(mddev_t *mddev) | 4471 | void md_do_sync(mddev_t *mddev) |
4471 | { | 4472 | { |
4472 | mddev_t *mddev2; | 4473 | mddev_t *mddev2; |
4473 | unsigned int currspeed = 0, | 4474 | unsigned int currspeed = 0, |
@@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev) | |||
4704 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 4705 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
4705 | md_wakeup_thread(mddev->thread); | 4706 | md_wakeup_thread(mddev->thread); |
4706 | } | 4707 | } |
4708 | EXPORT_SYMBOL_GPL(md_do_sync); | ||
4707 | 4709 | ||
4708 | 4710 | ||
4709 | /* | 4711 | /* |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 56cba8d3e398..b29135acb1d9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
331 | } | 331 | } |
332 | return 0; | 332 | return 0; |
333 | } | 333 | } |
334 | |||
335 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
334 | static int resize_stripes(raid5_conf_t *conf, int newsize) | 336 | static int resize_stripes(raid5_conf_t *conf, int newsize) |
335 | { | 337 | { |
336 | /* Make all the stripes able to hold 'newsize' devices. | 338 | /* Make all the stripes able to hold 'newsize' devices. |
@@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
451 | conf->pool_size = newsize; | 453 | conf->pool_size = newsize; |
452 | return err; | 454 | return err; |
453 | } | 455 | } |
454 | 456 | #endif | |
455 | 457 | ||
456 | static int drop_one_stripe(raid5_conf_t *conf) | 458 | static int drop_one_stripe(raid5_conf_t *conf) |
457 | { | 459 | { |
@@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1034 | return 0; | 1036 | return 0; |
1035 | } | 1037 | } |
1036 | 1038 | ||
1039 | static void end_reshape(raid5_conf_t *conf); | ||
1040 | |||
1037 | static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | 1041 | static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) |
1038 | { | 1042 | { |
1039 | int sectors_per_chunk = conf->chunk_size >> 9; | 1043 | int sectors_per_chunk = conf->chunk_size >> 9; |
@@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1844 | if (sector_nr >= max_sector) { | 1848 | if (sector_nr >= max_sector) { |
1845 | /* just being told to finish up .. nothing much to do */ | 1849 | /* just being told to finish up .. nothing much to do */ |
1846 | unplug_slaves(mddev); | 1850 | unplug_slaves(mddev); |
1851 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | ||
1852 | end_reshape(conf); | ||
1853 | return 0; | ||
1854 | } | ||
1847 | 1855 | ||
1848 | if (mddev->curr_resync < max_sector) /* aborted */ | 1856 | if (mddev->curr_resync < max_sector) /* aborted */ |
1849 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | 1857 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, |
@@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
2464 | return 0; | 2472 | return 0; |
2465 | } | 2473 | } |
2466 | 2474 | ||
2475 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
2476 | static int raid5_reshape(mddev_t *mddev, int raid_disks) | ||
2477 | { | ||
2478 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
2479 | int err; | ||
2480 | mdk_rdev_t *rdev; | ||
2481 | struct list_head *rtmp; | ||
2482 | int spares = 0; | ||
2483 | int added_devices = 0; | ||
2484 | |||
2485 | if (mddev->degraded || | ||
2486 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | ||
2487 | return -EBUSY; | ||
2488 | if (conf->raid_disks > raid_disks) | ||
2489 | return -EINVAL; /* Cannot shrink array yet */ | ||
2490 | if (conf->raid_disks == raid_disks) | ||
2491 | return 0; /* nothing to do */ | ||
2492 | |||
2493 | /* Can only proceed if there are plenty of stripe_heads. | ||
2494 | * We need a minimum of one full stripe, and for sensible progress | ||
2495 | * it is best to have about 4 times that. | ||
2496 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
2497 | * allow for chunk sizes up to 256K, which is probably OK. | ||
2498 | * If the chunk size is greater, user-space should request more | ||
2499 | * stripe_heads first. | ||
2500 | */ | ||
2501 | if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { | ||
2502 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
2503 | (mddev->chunk_size / STRIPE_SIZE)*4); | ||
2504 | return -ENOSPC; | ||
2505 | } | ||
2506 | |||
2507 | ITERATE_RDEV(mddev, rdev, rtmp) | ||
2508 | if (rdev->raid_disk < 0 && | ||
2509 | !test_bit(Faulty, &rdev->flags)) | ||
2510 | spares++; | ||
2511 | if (conf->raid_disks + spares < raid_disks-1) | ||
2512 | /* Not enough devices even to make a degraded array | ||
2513 | * of that size | ||
2514 | */ | ||
2515 | return -EINVAL; | ||
2516 | |||
2517 | err = resize_stripes(conf, raid_disks); | ||
2518 | if (err) | ||
2519 | return err; | ||
2520 | |||
2521 | spin_lock_irq(&conf->device_lock); | ||
2522 | conf->previous_raid_disks = conf->raid_disks; | ||
2523 | mddev->raid_disks = conf->raid_disks = raid_disks; | ||
2524 | conf->expand_progress = 0; | ||
2525 | spin_unlock_irq(&conf->device_lock); | ||
2526 | |||
2527 | /* Add some new drives, as many as will fit. | ||
2528 | * We know there are enough to make the newly sized array work. | ||
2529 | */ | ||
2530 | ITERATE_RDEV(mddev, rdev, rtmp) | ||
2531 | if (rdev->raid_disk < 0 && | ||
2532 | !test_bit(Faulty, &rdev->flags)) { | ||
2533 | if (raid5_add_disk(mddev, rdev)) { | ||
2534 | char nm[20]; | ||
2535 | set_bit(In_sync, &rdev->flags); | ||
2536 | conf->working_disks++; | ||
2537 | added_devices++; | ||
2538 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
2539 | sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | ||
2540 | } else | ||
2541 | break; | ||
2542 | } | ||
2543 | |||
2544 | mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; | ||
2545 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
2546 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
2547 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
2548 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
2549 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
2550 | "%s_reshape"); | ||
2551 | if (!mddev->sync_thread) { | ||
2552 | mddev->recovery = 0; | ||
2553 | spin_lock_irq(&conf->device_lock); | ||
2554 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | ||
2555 | conf->expand_progress = MaxSector; | ||
2556 | spin_unlock_irq(&conf->device_lock); | ||
2557 | return -EAGAIN; | ||
2558 | } | ||
2559 | md_wakeup_thread(mddev->sync_thread); | ||
2560 | md_new_event(mddev); | ||
2561 | return 0; | ||
2562 | } | ||
2563 | #endif | ||
2564 | |||
2565 | static void end_reshape(raid5_conf_t *conf) | ||
2566 | { | ||
2567 | struct block_device *bdev; | ||
2568 | |||
2569 | conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); | ||
2570 | set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); | ||
2571 | conf->mddev->changed = 1; | ||
2572 | |||
2573 | bdev = bdget_disk(conf->mddev->gendisk, 0); | ||
2574 | if (bdev) { | ||
2575 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
2576 | i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); | ||
2577 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
2578 | bdput(bdev); | ||
2579 | } | ||
2580 | spin_lock_irq(&conf->device_lock); | ||
2581 | conf->expand_progress = MaxSector; | ||
2582 | spin_unlock_irq(&conf->device_lock); | ||
2583 | } | ||
2584 | |||
2467 | static void raid5_quiesce(mddev_t *mddev, int state) | 2585 | static void raid5_quiesce(mddev_t *mddev, int state) |
2468 | { | 2586 | { |
2469 | raid5_conf_t *conf = mddev_to_conf(mddev); | 2587 | raid5_conf_t *conf = mddev_to_conf(mddev); |
@@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality = | |||
2502 | .spare_active = raid5_spare_active, | 2620 | .spare_active = raid5_spare_active, |
2503 | .sync_request = sync_request, | 2621 | .sync_request = sync_request, |
2504 | .resize = raid5_resize, | 2622 | .resize = raid5_resize, |
2623 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
2624 | .reshape = raid5_reshape, | ||
2625 | #endif | ||
2505 | .quiesce = raid5_quiesce, | 2626 | .quiesce = raid5_quiesce, |
2506 | }; | 2627 | }; |
2507 | 2628 | ||
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b6e0bcad84e1..9c77cde5a795 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h | |||
@@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | |||
92 | extern void md_super_wait(mddev_t *mddev); | 92 | extern void md_super_wait(mddev_t *mddev); |
93 | extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | 93 | extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, |
94 | struct page *page, int rw); | 94 | struct page *page, int rw); |
95 | 95 | extern void md_do_sync(mddev_t *mddev); | |
96 | extern void md_new_event(mddev_t *mddev); | ||
96 | 97 | ||
97 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } | 98 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } |
98 | 99 | ||