From b55e6bfcd23cb2f7249095050c649f7aea813f9f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:06 -0800 Subject: [PATCH] md: Split disks array out of raid5 conf structure so it is easier to grow The remainder of this batch implements raid5 reshaping. Currently the only shape change that is supported is adding a device, but it is envisioned that changing the chunksize and layout will also be supported, as well as changing the level (e.g. 1->5, 5->6). The reshape process naturally has to move all of the data in the array, and so should be used with caution. It is believed to work, and some testing does support this, but wider testing would be great for increasing my confidence. You will need a version of mdadm newer than 2.3.1 to make use of raid5 growth. This is because mdadm needs to take a copy of a 'critical section' at the start of the array in case there is a crash at an awkward moment. On restart, mdadm will restore the critical section and allow reshape to continue. I hope to release a 2.4-pre by early next week - it still needs a little more polishing. This patch: Previously the array of disk information was included in the raid5 'conf' structure which was allocated to an appropriate size. This makes it awkward to change the size of that array. So we split it off into a separate kmalloced array which will require a little extra indexing, but is much easier to grow. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/raid/raid5.h') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 394da8207b34..94dbdd406f12 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -240,7 +240,7 @@ struct raid5_private_data { * waiting for 25% to be free */ spinlock_t device_lock; - struct disk_info disks[0]; + struct disk_info *disks; }; typedef struct raid5_private_data raid5_conf_t; -- cgit v1.2.2 From ad01c9e3752f4ba4f3d99c89b7370fa4983a25b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:07 -0800 Subject: [PATCH] md: Allow stripes to be expanded in preparation for expanding an array Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache data structure. This requires allocating new stripes in a new kmem_cache. If this succeeds, we copy cache pages over and release the old stripes and kmem_cache. We then allocate new pages. If that fails, we leave the stripe cache at its new size. It isn't worth the effort to shrink it back again. Unfortunately this means we need two kmem_cache names as, for a short period of time, we have two kmem_caches. 
So they are raid5/%s and raid5/%s-alt Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux/raid/raid5.h') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 94dbdd406f12..b7b2653af7bb 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -216,7 +216,11 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ - char cache_name[20]; + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; kmem_cache_t *slab_cache; /* for allocating stripes */ int seq_flush, seq_write; @@ -238,7 +242,8 @@ struct raid5_private_data { wait_queue_head_t wait_for_overlap; int inactive_blocked; /* release of inactive stripes blocked, * waiting for 25% to be free - */ + */ + int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; }; -- cgit v1.2.2 From 7ecaa1e6a1ad69862e9980b6c777e11f26c4782d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:08 -0800 Subject: [PATCH] md: Infrastructure to allow normal IO to continue while array is expanding We need to allow that different stripes are of different effective sizes, and use the appropriate size. Also, when a stripe is being expanded, we must block any IO attempts until the stripe is stable again. Key elements in this change are: - each stripe_head gets a 'disks' field which is part of the key, thus there can sometimes be two stripe heads of the same area of the array, but covering different numbers of devices. One of these will be marked STRIPE_EXPANDING and so won't accept new requests. 
- conf->expand_progress tracks how the expansion is progressing and is used to determine whether the target part of the array has been expanded yet or not. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/raid/raid5.h') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index b7b2653af7bb..6fa274aea2a0 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -135,6 +135,7 @@ struct stripe_head { atomic_t count; /* nr of active thread/requests */ spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ struct r5dev { struct bio req; struct bio_vec vec; @@ -174,6 +175,7 @@ struct stripe_head { #define STRIPE_DELAYED 6 #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 /* * Plugging: @@ -211,6 +213,10 @@ struct raid5_private_data { int raid_disks, working_disks, failed_disks; int max_nr_stripes; + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + int previous_raid_disks; + struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ -- cgit v1.2.2 From ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:09 -0800 Subject: [PATCH] md: Core of raid5 resize process This patch provides the core of the resize/expand process. sync_request notices if a 'reshape' is happening and acts accordingly. It allocates new stripe_heads for the next chunk-wide-stripe in the target geometry, marking them STRIPE_EXPANDING. Then it finds which stripe heads in the old geometry can provide data needed by these and marks them STRIPE_EXPAND_SOURCE. 
This causes stripe_handle to read all blocks on those stripes. Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any that are needed are copied into the corresponding STRIPE_EXPANDING stripe_head. Once a STRIPE_EXPANDING stripe_head is full, it is marked STRIPE_EXPAND_READY and then is written out and released. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/raid/raid5.h') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 6fa274aea2a0..55c738d50508 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -157,6 +157,7 @@ struct stripe_head { #define R5_ReadError 8 /* seen a read error here recently */ #define R5_ReWrite 9 /* have tried to over-write the readerror */ +#define R5_Expanded 10 /* This block now has post-expand data */ /* * Write method */ @@ -176,7 +177,8 @@ struct stripe_head { #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 #define STRIPE_EXPANDING 9 - +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 /* * Plugging: * -- cgit v1.2.2 From f67055780caac6a99f43834795c43acf99eba6a6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:11 -0800 Subject: [PATCH] md: Checkpoint and allow restart of raid5 reshape We allow the superblock to record an 'old' and a 'new' geometry, and a position where any conversion is up to. The geometry allows for changing chunksize, layout and level as well as number of devices. When using version-0.90 superblock, we convert the version to 0.91 while the conversion is happening so that an old kernel will refuse to assemble the array. For version-1, we use a feature bit for the same effect. When starting an array we check for an incomplete reshape and restart the reshape process if needed. 
If the reshape stopped at an awkward time (like when updating the first stripe) we refuse to assemble the array, and let user-space worry about it. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/raid/raid5.h') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 55c738d50508..abcdf0d0658a 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -224,6 +224,7 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. */ -- cgit v1.2.2 From b578d55fdd80140f657130abd85aebeb345755fb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:12 -0800 Subject: [PATCH] md: Only checkpoint expansion progress occasionally Instead of checkpointing at each stripe, only checkpoint when a new write would overwrite uncheckpointed data. Block any write to the uncheckpointed area. Arbitrarily checkpoint at least every 3Meg. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/raid/raid5.h') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index abcdf0d0658a..914af667044f 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -217,6 +217,9 @@ struct raid5_private_data { /* used during an expand */ sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress it out-of-bounds + * as we haven't flushed the metadata yet + */ int previous_raid_disks; struct list_head handle_list; /* stripes needing handling */ -- cgit v1.2.2