diff options
| author | NeilBrown <neilb@suse.de> | 2006-03-27 04:18:10 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-03-27 11:45:01 -0500 |
| commit | 292695531ae4019bb15deedc121b218d1908b648 (patch) | |
| tree | fb205eae13c3f3410f6ea44557b1c96b075a4d44 | |
| parent | ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e (diff) | |
[PATCH] md: Final stages of raid5 expand code
This patch adds raid5_reshape and end_reshape which will start and finish the
reshape processes.
raid5_reshape is only enabled if CONFIG_MD_RAID5_RESHAPE is set, to discourage
accidental use.
Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry,
and make sure that you have backups, just in case.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
| -rw-r--r-- | drivers/md/Kconfig | 26 | ||||
| -rw-r--r-- | drivers/md/md.c | 6 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 123 | ||||
| -rw-r--r-- | include/linux/raid/md.h | 3 |
4 files changed, 154 insertions, 4 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ac43f98062fd..fd2aae150ccc 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
| @@ -127,6 +127,32 @@ config MD_RAID5 | |||
| 127 | 127 | ||
| 128 | If unsure, say Y. | 128 | If unsure, say Y. |
| 129 | 129 | ||
| 130 | config MD_RAID5_RESHAPE | ||
| 131 | bool "Support adding drives to a raid-5 array (experimental)" | ||
| 132 | depends on MD_RAID5 && EXPERIMENTAL | ||
| 133 | ---help--- | ||
| 134 | A RAID-5 set can be expanded by adding extra drives. This | ||
| 135 | requires "restriping" the array which means (almost) every | ||
| 136 | block must be written to a different place. | ||
| 137 | |||
| 138 | This option allows such restriping to be done while the array | ||
| 139 | is online. However it is still EXPERIMENTAL code. It should | ||
| 140 | work, but please be sure that you have backups. | ||
| 141 | |||
| 142 | You will need a version of mdadm newer than 2.3.1. During the | ||
| 143 | early stage of reshape there is a critical section where live data | ||
| 144 | is being over-written. A crash during this time needs extra care | ||
| 145 | for recovery. The newer mdadm takes a copy of the data in the | ||
| 146 | critical section and will restore it, if necessary, after a crash. | ||
| 147 | |||
| 148 | The mdadm usage is e.g. | ||
| 149 | mdadm --grow /dev/md1 --raid-disks=6 | ||
| 150 | to grow '/dev/md1' to having 6 disks. | ||
| 151 | |||
| 152 | Note: The array can only be expanded, not contracted. | ||
| 153 | There should be enough spares already present to make the new | ||
| 154 | array workable. | ||
| 155 | |||
| 130 | config MD_RAID6 | 156 | config MD_RAID6 |
| 131 | tristate "RAID-6 mode" | 157 | tristate "RAID-6 mode" |
| 132 | depends on BLK_DEV_MD | 158 | depends on BLK_DEV_MD |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e65986bc63f..d169bc964676 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -158,11 +158,12 @@ static int start_readonly; | |||
| 158 | */ | 158 | */ |
| 159 | static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); | 159 | static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); |
| 160 | static atomic_t md_event_count; | 160 | static atomic_t md_event_count; |
| 161 | static void md_new_event(mddev_t *mddev) | 161 | void md_new_event(mddev_t *mddev) |
| 162 | { | 162 | { |
| 163 | atomic_inc(&md_event_count); | 163 | atomic_inc(&md_event_count); |
| 164 | wake_up(&md_event_waiters); | 164 | wake_up(&md_event_waiters); |
| 165 | } | 165 | } |
| 166 | EXPORT_SYMBOL_GPL(md_new_event); | ||
| 166 | 167 | ||
| 167 | /* | 168 | /* |
| 168 | * Enables to iterate over all existing md arrays | 169 | * Enables to iterate over all existing md arrays |
| @@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | |||
| 4467 | 4468 | ||
| 4468 | #define SYNC_MARKS 10 | 4469 | #define SYNC_MARKS 10 |
| 4469 | #define SYNC_MARK_STEP (3*HZ) | 4470 | #define SYNC_MARK_STEP (3*HZ) |
| 4470 | static void md_do_sync(mddev_t *mddev) | 4471 | void md_do_sync(mddev_t *mddev) |
| 4471 | { | 4472 | { |
| 4472 | mddev_t *mddev2; | 4473 | mddev_t *mddev2; |
| 4473 | unsigned int currspeed = 0, | 4474 | unsigned int currspeed = 0, |
| @@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev) | |||
| 4704 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 4705 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
| 4705 | md_wakeup_thread(mddev->thread); | 4706 | md_wakeup_thread(mddev->thread); |
| 4706 | } | 4707 | } |
| 4708 | EXPORT_SYMBOL_GPL(md_do_sync); | ||
| 4707 | 4709 | ||
| 4708 | 4710 | ||
| 4709 | /* | 4711 | /* |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 56cba8d3e398..b29135acb1d9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
| 331 | } | 331 | } |
| 332 | return 0; | 332 | return 0; |
| 333 | } | 333 | } |
| 334 | |||
| 335 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
| 334 | static int resize_stripes(raid5_conf_t *conf, int newsize) | 336 | static int resize_stripes(raid5_conf_t *conf, int newsize) |
| 335 | { | 337 | { |
| 336 | /* Make all the stripes able to hold 'newsize' devices. | 338 | /* Make all the stripes able to hold 'newsize' devices. |
| @@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
| 451 | conf->pool_size = newsize; | 453 | conf->pool_size = newsize; |
| 452 | return err; | 454 | return err; |
| 453 | } | 455 | } |
| 454 | 456 | #endif | |
| 455 | 457 | ||
| 456 | static int drop_one_stripe(raid5_conf_t *conf) | 458 | static int drop_one_stripe(raid5_conf_t *conf) |
| 457 | { | 459 | { |
| @@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 1034 | return 0; | 1036 | return 0; |
| 1035 | } | 1037 | } |
| 1036 | 1038 | ||
| 1039 | static void end_reshape(raid5_conf_t *conf); | ||
| 1040 | |||
| 1037 | static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | 1041 | static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) |
| 1038 | { | 1042 | { |
| 1039 | int sectors_per_chunk = conf->chunk_size >> 9; | 1043 | int sectors_per_chunk = conf->chunk_size >> 9; |
| @@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1844 | if (sector_nr >= max_sector) { | 1848 | if (sector_nr >= max_sector) { |
| 1845 | /* just being told to finish up .. nothing much to do */ | 1849 | /* just being told to finish up .. nothing much to do */ |
| 1846 | unplug_slaves(mddev); | 1850 | unplug_slaves(mddev); |
| 1851 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | ||
| 1852 | end_reshape(conf); | ||
| 1853 | return 0; | ||
| 1854 | } | ||
| 1847 | 1855 | ||
| 1848 | if (mddev->curr_resync < max_sector) /* aborted */ | 1856 | if (mddev->curr_resync < max_sector) /* aborted */ |
| 1849 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | 1857 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, |
| @@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
| 2464 | return 0; | 2472 | return 0; |
| 2465 | } | 2473 | } |
| 2466 | 2474 | ||
| 2475 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
| 2476 | static int raid5_reshape(mddev_t *mddev, int raid_disks) | ||
| 2477 | { | ||
| 2478 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
| 2479 | int err; | ||
| 2480 | mdk_rdev_t *rdev; | ||
| 2481 | struct list_head *rtmp; | ||
| 2482 | int spares = 0; | ||
| 2483 | int added_devices = 0; | ||
| 2484 | |||
| 2485 | if (mddev->degraded || | ||
| 2486 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | ||
| 2487 | return -EBUSY; | ||
| 2488 | if (conf->raid_disks > raid_disks) | ||
| 2489 | return -EINVAL; /* Cannot shrink array yet */ | ||
| 2490 | if (conf->raid_disks == raid_disks) | ||
| 2491 | return 0; /* nothing to do */ | ||
| 2492 | |||
| 2493 | /* Can only proceed if there are plenty of stripe_heads. | ||
| 2494 | * We need a minimum of one full stripe,, and for sensible progress | ||
| 2495 | * it is best to have about 4 times that. | ||
| 2496 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
| 2497 | * allow for chunk sizes up to 256K, which is probably OK. | ||
| 2498 | * If the chunk size is greater, user-space should request more | ||
| 2499 | * stripe_heads first. | ||
| 2500 | */ | ||
| 2501 | if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { | ||
| 2502 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
| 2503 | (mddev->chunk_size / STRIPE_SIZE)*4); | ||
| 2504 | return -ENOSPC; | ||
| 2505 | } | ||
| 2506 | |||
| 2507 | ITERATE_RDEV(mddev, rdev, rtmp) | ||
| 2508 | if (rdev->raid_disk < 0 && | ||
| 2509 | !test_bit(Faulty, &rdev->flags)) | ||
| 2510 | spares++; | ||
| 2511 | if (conf->raid_disks + spares < raid_disks-1) | ||
| 2512 | /* Not enough devices even to make a degraded array | ||
| 2513 | * of that size | ||
| 2514 | */ | ||
| 2515 | return -EINVAL; | ||
| 2516 | |||
| 2517 | err = resize_stripes(conf, raid_disks); | ||
| 2518 | if (err) | ||
| 2519 | return err; | ||
| 2520 | |||
| 2521 | spin_lock_irq(&conf->device_lock); | ||
| 2522 | conf->previous_raid_disks = conf->raid_disks; | ||
| 2523 | mddev->raid_disks = conf->raid_disks = raid_disks; | ||
| 2524 | conf->expand_progress = 0; | ||
| 2525 | spin_unlock_irq(&conf->device_lock); | ||
| 2526 | |||
| 2527 | /* Add some new drives, as many as will fit. | ||
| 2528 | * We know there are enough to make the newly sized array work. | ||
| 2529 | */ | ||
| 2530 | ITERATE_RDEV(mddev, rdev, rtmp) | ||
| 2531 | if (rdev->raid_disk < 0 && | ||
| 2532 | !test_bit(Faulty, &rdev->flags)) { | ||
| 2533 | if (raid5_add_disk(mddev, rdev)) { | ||
| 2534 | char nm[20]; | ||
| 2535 | set_bit(In_sync, &rdev->flags); | ||
| 2536 | conf->working_disks++; | ||
| 2537 | added_devices++; | ||
| 2538 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
| 2539 | sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | ||
| 2540 | } else | ||
| 2541 | break; | ||
| 2542 | } | ||
| 2543 | |||
| 2544 | mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; | ||
| 2545 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
| 2546 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
| 2547 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
| 2548 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
| 2549 | mddev->sync_thread = md_register_thread(md_do_sync, mddev, | ||
| 2550 | "%s_reshape"); | ||
| 2551 | if (!mddev->sync_thread) { | ||
| 2552 | mddev->recovery = 0; | ||
| 2553 | spin_lock_irq(&conf->device_lock); | ||
| 2554 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | ||
| 2555 | conf->expand_progress = MaxSector; | ||
| 2556 | spin_unlock_irq(&conf->device_lock); | ||
| 2557 | return -EAGAIN; | ||
| 2558 | } | ||
| 2559 | md_wakeup_thread(mddev->sync_thread); | ||
| 2560 | md_new_event(mddev); | ||
| 2561 | return 0; | ||
| 2562 | } | ||
| 2563 | #endif | ||
| 2564 | |||
| 2565 | static void end_reshape(raid5_conf_t *conf) | ||
| 2566 | { | ||
| 2567 | struct block_device *bdev; | ||
| 2568 | |||
| 2569 | conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); | ||
| 2570 | set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); | ||
| 2571 | conf->mddev->changed = 1; | ||
| 2572 | |||
| 2573 | bdev = bdget_disk(conf->mddev->gendisk, 0); | ||
| 2574 | if (bdev) { | ||
| 2575 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
| 2576 | i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); | ||
| 2577 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
| 2578 | bdput(bdev); | ||
| 2579 | } | ||
| 2580 | spin_lock_irq(&conf->device_lock); | ||
| 2581 | conf->expand_progress = MaxSector; | ||
| 2582 | spin_unlock_irq(&conf->device_lock); | ||
| 2583 | } | ||
| 2584 | |||
| 2467 | static void raid5_quiesce(mddev_t *mddev, int state) | 2585 | static void raid5_quiesce(mddev_t *mddev, int state) |
| 2468 | { | 2586 | { |
| 2469 | raid5_conf_t *conf = mddev_to_conf(mddev); | 2587 | raid5_conf_t *conf = mddev_to_conf(mddev); |
| @@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality = | |||
| 2502 | .spare_active = raid5_spare_active, | 2620 | .spare_active = raid5_spare_active, |
| 2503 | .sync_request = sync_request, | 2621 | .sync_request = sync_request, |
| 2504 | .resize = raid5_resize, | 2622 | .resize = raid5_resize, |
| 2623 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
| 2624 | .reshape = raid5_reshape, | ||
| 2625 | #endif | ||
| 2505 | .quiesce = raid5_quiesce, | 2626 | .quiesce = raid5_quiesce, |
| 2506 | }; | 2627 | }; |
| 2507 | 2628 | ||
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b6e0bcad84e1..9c77cde5a795 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h | |||
| @@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | |||
| 92 | extern void md_super_wait(mddev_t *mddev); | 92 | extern void md_super_wait(mddev_t *mddev); |
| 93 | extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | 93 | extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, |
| 94 | struct page *page, int rw); | 94 | struct page *page, int rw); |
| 95 | 95 | extern void md_do_sync(mddev_t *mddev); | |
| 96 | extern void md_new_event(mddev_t *mddev); | ||
| 96 | 97 | ||
| 97 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } | 98 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } |
| 98 | 99 | ||
