From 1442577bf6da1a31ae6555212202be9a2cec5642 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 15:55:25 +1100 Subject: Revert "md: do not progress the resync process if the stripe was blocked" This reverts commit df10cfbc4d7ab93260d997df754219d390d62a9d. This patch was based on a misunderstanding and risks introducing a busy-wait loop. So revert it. Acked-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/raid5.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 94829804ab7f..ec14ff5b508a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2896,7 +2896,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, * */ -static bool handle_stripe5(struct stripe_head *sh) +static void handle_stripe5(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; @@ -3167,11 +3167,9 @@ static bool handle_stripe5(struct stripe_head *sh) ops_run_io(sh, &s); return_io(return_bi); - - return blocked_rdev == NULL; } -static bool handle_stripe6(struct stripe_head *sh) +static void handle_stripe6(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; @@ -3455,17 +3453,14 @@ static bool handle_stripe6(struct stripe_head *sh) ops_run_io(sh, &s); return_io(return_bi); - - return blocked_rdev == NULL; } -/* returns true if the stripe was handled */ -static bool handle_stripe(struct stripe_head *sh) +static void handle_stripe(struct stripe_head *sh) { if (sh->raid_conf->level == 6) - return handle_stripe6(sh); + handle_stripe6(sh); else - return handle_stripe5(sh); + handle_stripe5(sh); } static void raid5_activate_delayed(raid5_conf_t *conf) @@ -4277,9 +4272,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski clear_bit(STRIPE_INSYNC, &sh->state); spin_unlock(&sh->lock); - /* wait for any blocked device to be handled */ - while (unlikely(!handle_stripe(sh))) - ; + handle_stripe(sh); release_stripe(sh); return STRIPE_SECTORS; -- cgit v1.2.2 From 1d9d52416c0445019ccc1f0fddb9a227456eb61b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 15:55:32 +1100 Subject: md/raid1/raid10: add a cond_resched During 'check' of a raid1 or raid10 it is possible for the management thread to spend a lot of time running 'memcmp' on blocks from different devices, so make sure the thread has a chance to schedule. raid5d already has a cond_resched (in process_stripe). Reported-By: Lee Howard Signed-off-by: NeilBrown --- drivers/md/raid1.c | 1 + drivers/md/raid10.c | 1 + 2 files changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d1b9bd5fd4f6..71a01a2e1938 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1683,6 +1683,7 @@ static void raid1d(mddev_t *mddev) generic_make_request(bio); } } + cond_resched(); } if (unplug) unplug_slaves(mddev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 51c4c5c4d87a..69fc76caa469 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1632,6 +1632,7 @@ static void raid10d(mddev_t *mddev) generic_make_request(bio); } } + cond_resched(); } if (unplug) unplug_slaves(mddev); -- cgit v1.2.2 From f5efd45ae597c96ed017afad5662b67d55b402a0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 16 Oct 2009 15:55:38 +1100 Subject: md/raid5: initialize conf->device_lock earlier Deallocating a raid5_conf_t structure requires taking 'device_lock'. Ensure it is initialized before it is used, i.e. initialize the lock before attempting any further initializations that might fail. Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/raid5.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ec14ff5b508a..c3e596778618 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4715,6 +4715,18 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); if (conf == NULL) goto abort; + spin_lock_init(&conf->device_lock); + init_waitqueue_head(&conf->wait_for_stripe); + init_waitqueue_head(&conf->wait_for_overlap); + INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->hold_list); + INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); + INIT_LIST_HEAD(&conf->inactive_list); + atomic_set(&conf->active_stripes, 0); + atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); + conf->bypass_threshold = BYPASS_THRESHOLD; conf->raid_disks = mddev->raid_disks; conf->scribble_len = scribble_len(conf->raid_disks); @@ -4737,19 +4749,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if (raid5_alloc_percpu(conf) != 0) goto abort; - spin_lock_init(&conf->device_lock); - init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); - INIT_LIST_HEAD(&conf->handle_list); - INIT_LIST_HEAD(&conf->hold_list); - INIT_LIST_HEAD(&conf->delayed_list); - INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); - atomic_set(&conf->active_stripes, 0); - atomic_set(&conf->preread_active_stripes, 0); - atomic_set(&conf->active_aligned_reads, 0); - conf->bypass_threshold = BYPASS_THRESHOLD; - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); list_for_each_entry(rdev, &mddev->disks, same_set) { -- cgit v1.2.2 From ed9bfdf1a40952fd0f8094ec77f876b84ead69af Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 15:55:44 +1100 Subject: md: raid1/raid10: handle allocation errors during array setup. Both raid1 and raid10 create a mempool during startup. If the 'alloc' function for this mempool fails, unplug_slaves is called. If that happens when the pool is being initialised, unplug_slaves will try to use the 'conf' structure that isn't filled in yet, and badness will happen. So ensure that unplug_slaves doesn't get called unless we know that the conf structure if fully initialised. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 5 +++-- drivers/md/raid10.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 71a01a2e1938..a053423785c9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -64,7 +64,7 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) /* allocate a r1bio with room for raid_disks entries in the bios array */ r1_bio = kzalloc(size, gfp_flags); - if (!r1_bio) + if (!r1_bio && pi->mddev) unplug_slaves(pi->mddev); return r1_bio; @@ -1979,13 +1979,14 @@ static int run(mddev_t *mddev) conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); if (!conf->poolinfo) goto out_no_mem; - conf->poolinfo->mddev = mddev; + conf->poolinfo->mddev = NULL; conf->poolinfo->raid_disks = mddev->raid_disks; conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); if (!conf->r1bio_pool) goto out_no_mem; + conf->poolinfo->mddev = mddev; spin_lock_init(&conf->device_lock); mddev->queue->queue_lock = &conf->device_lock; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 69fc76caa469..c2cb7b87b440 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -68,7 +68,7 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) /* allocate a r10bio with room for raid_disks entries in the bios array */ r10_bio = kzalloc(size, gfp_flags); - if (!r10_bio) + if (!r10_bio && conf->mddev) unplug_slaves(conf->mddev); return r10_bio; @@ -2096,7 +2096,6 @@ static int run(mddev_t *mddev) if (!conf->tmppage) goto out_free_conf; - conf->mddev = mddev; conf->raid_disks = mddev->raid_disks; conf->near_copies = nc; conf->far_copies = fc; @@ -2133,6 +2132,7 @@ static int run(mddev_t *mddev) goto out_free_conf; } + conf->mddev = mddev; spin_lock_init(&conf->device_lock); mddev->queue->queue_lock = &conf->device_lock; -- cgit v1.2.2 From ae8fa2831bbc5092ee9d1b90e623af881cf27140 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 15:56:01 +1100 Subject: md: remove clumsy usage of do_sync_mapping_range from bitmap code and replace with vfs_fsync which is much neater (but wasn't exported, or even in existence at the time the code was written). Cc: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6986b0059d23..60e2b322db11 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1624,10 +1624,11 @@ int bitmap_create(mddev_t *mddev) bitmap->offset = mddev->bitmap_offset; if (file) { get_file(file); - do_sync_mapping_range(file->f_mapping, 0, LLONG_MAX, - SYNC_FILE_RANGE_WAIT_BEFORE | - SYNC_FILE_RANGE_WRITE | - SYNC_FILE_RANGE_WAIT_AFTER); + /* As future accesses to this file will use bmap, + * and bypass the page cache, we must sync the file + * first. + */ + vfs_fsync(file, file->f_dentry, 1); } /* read superblock from bitmap file (this sets bitmap->chunksize) */ err = bitmap_read_sb(bitmap); -- cgit v1.2.2 From dce3a7a42d585b74ce68081010b42afe81c8f4c4 Mon Sep 17 00:00:00 2001 From: Vladimir Dronnikov Date: Fri, 16 Oct 2009 16:25:19 +1100 Subject: md: drivers/md/unroll.pl replaced with awk analog drivers/md/unroll.pl replaced by awk script to drop build-time dependency on perl Signed-off-by: Vladimir Dronnikov Signed-off-by: NeilBrown --- drivers/md/Makefile | 22 +++++++++++----------- drivers/md/raid6altivec.uc | 2 +- drivers/md/raid6int.uc | 2 +- drivers/md/raid6test/Makefile | 42 +++++++++++++++++++++--------------------- drivers/md/unroll.awk | 20 ++++++++++++++++++++ drivers/md/unroll.pl | 24 ------------------------ 6 files changed, 54 insertions(+), 58 deletions(-) create mode 100644 drivers/md/unroll.awk delete mode 100644 drivers/md/unroll.pl (limited to 'drivers/md') diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1dc4185bd781..e355e7f6a536 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -46,7 +46,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o quiet_cmd_unroll = UNROLL $@ - cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ + cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \ < $< > $@ || ( rm -f $@ && exit 1 ) ifeq ($(CONFIG_ALTIVEC),y) @@ -59,56 +59,56 @@ endif targets += raid6int1.c $(obj)/raid6int1.c: UNROLL := 1 -$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE +$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) targets += raid6int2.c $(obj)/raid6int2.c: UNROLL := 2 -$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE +$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) targets += raid6int4.c $(obj)/raid6int4.c: UNROLL := 4 -$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE +$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) targets += raid6int8.c $(obj)/raid6int8.c: UNROLL := 8 -$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE +$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) targets += raid6int16.c $(obj)/raid6int16.c: UNROLL := 16 -$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE +$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) targets += raid6int32.c $(obj)/raid6int32.c: UNROLL := 32 -$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE +$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) CFLAGS_raid6altivec1.o += $(altivec_flags) targets += raid6altivec1.c $(obj)/raid6altivec1.c: UNROLL := 1 -$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) CFLAGS_raid6altivec2.o += $(altivec_flags) targets += raid6altivec2.c $(obj)/raid6altivec2.c: UNROLL := 2 -$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) CFLAGS_raid6altivec4.o += $(altivec_flags) targets += raid6altivec4.c $(obj)/raid6altivec4.c: UNROLL := 4 -$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) CFLAGS_raid6altivec8.o += $(altivec_flags) targets += raid6altivec8.c $(obj)/raid6altivec8.c: UNROLL := 8 -$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE +$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) quiet_cmd_mktable = TABLE $@ diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index 699dfeee4944..2654d5c854be 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc @@ -15,7 +15,7 @@ * * $#-way unrolled portable integer math RAID-6 instruction set * - * This file is postprocessed using unroll.pl + * This file is postprocessed using unroll.awk * * hpa: in process, * you can just "steal" the vec unit with enable_kernel_altivec() (but diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index f9bf9cba357f..d1e276a14fab 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc @@ -15,7 +15,7 @@ * * $#-way unrolled portable integer math RAID-6 instruction set * - * This file is postprocessed using unroll.pl + * This file is postprocessed using unroll.awk */ #include diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile index 58ffdf4f5161..2874cbef529d 100644 --- a/drivers/md/raid6test/Makefile +++ b/drivers/md/raid6test/Makefile @@ -7,7 +7,7 @@ CC = gcc OPTFLAGS = -O2 # Adjust as desired CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) LD = ld -PERL = perl +AWK = awk AR = ar RANLIB = ranlib @@ -35,35 +35,35 @@ raid6.a: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \ raid6test: test.c raid6.a $(CC) $(CFLAGS) -o raid6test $^ -raid6altivec1.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 1 < raid6altivec.uc > $@ +raid6altivec1.c: raid6altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < raid6altivec.uc > $@ -raid6altivec2.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 2 < raid6altivec.uc > $@ +raid6altivec2.c: raid6altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < raid6altivec.uc > $@ -raid6altivec4.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 4 < raid6altivec.uc > $@ +raid6altivec4.c: raid6altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < raid6altivec.uc > $@ -raid6altivec8.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 8 < raid6altivec.uc > $@ +raid6altivec8.c: raid6altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < raid6altivec.uc > $@ -raid6int1.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 1 < raid6int.uc > $@ +raid6int1.c: raid6int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < raid6int.uc > $@ -raid6int2.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 2 < raid6int.uc > $@ +raid6int2.c: raid6int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < raid6int.uc > $@ -raid6int4.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 4 < raid6int.uc > $@ +raid6int4.c: raid6int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < raid6int.uc > $@ -raid6int8.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 8 < raid6int.uc > $@ +raid6int8.c: raid6int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < raid6int.uc > $@ -raid6int16.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 16 < raid6int.uc > $@ +raid6int16.c: raid6int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=16 < raid6int.uc > $@ -raid6int32.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 32 < raid6int.uc > $@ +raid6int32.c: raid6int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=32 < raid6int.uc > $@ raid6tables.c: mktables ./mktables > raid6tables.c diff --git a/drivers/md/unroll.awk b/drivers/md/unroll.awk new file mode 100644 index 000000000000..c6aa03631df8 --- /dev/null +++ b/drivers/md/unroll.awk @@ -0,0 +1,20 @@ + +# This filter requires one command line option of form -vN=n +# where n must be a decimal number. +# +# Repeat each input line containing $$ n times, replacing $$ with 0...n-1. +# Replace each $# with n, and each $* with a single $. + +BEGIN { + n = N + 0 +} +{ + if (/\$\$/) { rep = n } else { rep = 1 } + for (i = 0; i < rep; ++i) { + tmp = $0 + gsub(/\$\$/, i, tmp) + gsub(/\$\#/, n, tmp) + gsub(/\$\*/, "$", tmp) + print tmp + } +} diff --git a/drivers/md/unroll.pl b/drivers/md/unroll.pl deleted file mode 100644 index 3acc710a20ea..000000000000 --- a/drivers/md/unroll.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl -# -# Take a piece of C code and for each line which contains the sequence $$ -# repeat n times with $ replaced by 0...n-1; the sequence $# is replaced -# by the unrolling factor, and $* with a single $ -# - -($n) = @ARGV; -$n += 0; - -while ( defined($line = ) ) { - if ( $line =~ /\$\$/ ) { - $rep = $n; - } else { - $rep = 1; - } - for ( $i = 0 ; $i < $rep ; $i++ ) { - $tmp = $line; - $tmp =~ s/\$\$/$i/g; - $tmp =~ s/\$\#/$n/g; - $tmp =~ s/\$\*/\$/g; - print $tmp; - } -} -- cgit v1.2.2 From 417b8d4ac868cf58d6c68f52d72f7648413e0edc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 16 Oct 2009 16:25:22 +1100 Subject: md/raid456: downlevel multicore operations to raid_run_ops The percpu conversion allowed a straightforward handoff of stripe processing to the async subsytem that initially showed some modest gains (+4%). However, this model is too simplistic and leads to stripes bouncing between raid5d and the async thread pool for every invocation of handle_stripe(). As reported by Holger this can fall into a pathological situation severely impacting throughput (6x performance loss). By downleveling the parallelism to raid_run_ops the pathological stripe_head bouncing is eliminated. This version still exhibits an average 11% throughput loss for: mdadm --create /dev/md0 /dev/sd[b-q] -n 16 -l 6 echo 1024 > /sys/block/md0/md/stripe_cache_size dd if=/dev/zero of=/dev/md0 bs=1024k count=2048 ...but the results are at least stable and can be used as a base for further multicore experimentation. Reported-by: Holger Kiehl Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/raid5.c | 75 +++++++++++++++++++++++++++++------------------------- drivers/md/raid5.h | 12 ++++++++- 2 files changed, 51 insertions(+), 36 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c3e596778618..25c3c29134d1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1139,7 +1139,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu &sh->ops.zero_sum_result, percpu->spare_page, &submit); } -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -1204,6 +1204,36 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } +#ifdef CONFIG_MULTICORE_RAID456 +static void async_run_ops(void *param, async_cookie_t cookie) +{ + struct stripe_head *sh = param; + unsigned long ops_request = sh->ops.request; + + clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); + wake_up(&sh->ops.wait_for_ops); + + __raid_run_ops(sh, ops_request); + release_stripe(sh); +} + +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +{ + /* since handle_stripe can be called outside of raid5d context + * we need to ensure sh->ops.request is de-staged before another + * request arrives + */ + wait_event(sh->ops.wait_for_ops, + !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); + sh->ops.request = ops_request; + + atomic_inc(&sh->count); + async_schedule(async_run_ops, sh); +} +#else +#define raid_run_ops __raid_run_ops +#endif + static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; @@ -1213,6 +1243,9 @@ static int grow_one_stripe(raid5_conf_t *conf) memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); + #ifdef CONFIG_MULTICORE_RAID456 + init_waitqueue_head(&sh->ops.wait_for_ops); + #endif if (grow_buffers(sh, conf->raid_disks)) { shrink_buffers(sh, conf->raid_disks); @@ -1329,6 +1362,9 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) nsh->raid_conf = conf; spin_lock_init(&nsh->lock); + #ifdef CONFIG_MULTICORE_RAID456 + init_waitqueue_head(&nsh->ops.wait_for_ops); + #endif list_add(&nsh->lru, &newstripes); } @@ -4342,37 +4378,6 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) return handled; } -#ifdef CONFIG_MULTICORE_RAID456 -static void __process_stripe(void *param, async_cookie_t cookie) -{ - struct stripe_head *sh = param; - - handle_stripe(sh); - release_stripe(sh); -} - -static void process_stripe(struct stripe_head *sh, struct list_head *domain) -{ - async_schedule_domain(__process_stripe, sh, domain); -} - -static void synchronize_stripe_processing(struct list_head *domain) -{ - async_synchronize_full_domain(domain); -} -#else -static void process_stripe(struct stripe_head *sh, struct list_head *domain) -{ - handle_stripe(sh); - release_stripe(sh); - cond_resched(); -} - -static void synchronize_stripe_processing(struct list_head *domain) -{ -} -#endif - /* * This is our raid5 kernel thread. @@ -4386,7 +4391,6 @@ static void raid5d(mddev_t *mddev) struct stripe_head *sh; raid5_conf_t *conf = mddev->private; int handled; - LIST_HEAD(raid_domain); pr_debug("+++ raid5d active\n"); @@ -4423,7 +4427,9 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); handled++; - process_stripe(sh, &raid_domain); + handle_stripe(sh); + release_stripe(sh); + cond_resched(); spin_lock_irq(&conf->device_lock); } @@ -4431,7 +4437,6 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); - synchronize_stripe_processing(&raid_domain); async_tx_issue_pending_all(); unplug_slaves(mddev); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2390e0e83daf..dcefdc9629ee 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -214,12 +214,20 @@ struct stripe_head { int disks; /* disks in stripe */ enum check_states check_state; enum reconstruct_states reconstruct_state; - /* stripe_operations + /** + * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target + * @target2 - 2nd compute target in the raid6 case + * @zero_sum_result - P and Q verification flags + * @request - async service request flags for raid_run_ops */ struct stripe_operations { int target, target2; enum sum_check_flags zero_sum_result; + #ifdef CONFIG_MULTICORE_RAID456 + unsigned long request; + wait_queue_head_t wait_for_ops; + #endif } ops; struct r5dev { struct bio req; @@ -294,6 +302,8 @@ struct r6_state { #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ #define STRIPE_BIOFILL_RUN 14 #define STRIPE_COMPUTE_RUN 15 +#define STRIPE_OPS_REQ_PENDING 16 + /* * Operation request flags */ -- cgit v1.2.2 From e4424fee1815f996dccd36be44d68ca160ec3e1b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 16:27:34 +1100 Subject: md: fix problems with RAID6 calculations for DDF. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 20 +++++++++++++------- drivers/md/raid5.h | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 25c3c29134d1..d4ce51b4d41b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -158,11 +158,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, { int slot; + if (sh->ddf_layout) + slot = (*count)++; if (idx == sh->pd_idx) return syndrome_disks; if (idx == sh->qd_idx) return syndrome_disks + 1; - slot = (*count)++; + if (!sh->ddf_layout) + slot = (*count)++; return slot; } @@ -727,9 +730,8 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) srcs[slot] = sh->dev[i].page; i = raid6_next_disk(i, disks); } while (i != d0_idx); - BUG_ON(count != syndrome_disks); - return count; + return syndrome_disks; } static struct dma_async_tx_descriptor * @@ -828,7 +830,6 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) failb = slot; i = raid6_next_disk(i, disks); } while (i != d0_idx); - BUG_ON(count != syndrome_disks); BUG_ON(faila == failb); if (failb < faila) @@ -845,7 +846,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu)); - return async_gen_syndrome(blocks, 0, count+2, + return async_gen_syndrome(blocks, 0, syndrome_disks+2, STRIPE_SIZE, &submit); } else { struct page *dest; @@ -1935,10 +1936,15 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) case ALGORITHM_PARITY_N: break; case ALGORITHM_ROTATING_N_CONTINUE: + /* Like left_symmetric, but P is before Q */ if (sh->pd_idx == 0) i--; /* P D D D Q */ - else if (i > sh->pd_idx) - i -= 2; /* D D Q P D */ + else { + /* D D Q P D */ + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + } break; case ALGORITHM_LEFT_ASYMMETRIC_6: case ALGORITHM_RIGHT_ASYMMETRIC_6: diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index dcefdc9629ee..dd708359b451 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -488,7 +488,7 @@ static inline int algorithm_valid_raid6(int layout) { return (layout >= 0 && layout <= 5) || - (layout == 8 || layout == 10) + (layout >= 8 && layout <= 10) || (layout >= 16 && layout <= 20); } -- cgit v1.2.2 From 5e5e3e78ed9038b8f7112835d07084eefb9daa47 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 16:35:30 +1100 Subject: md: Fix handling of raid5 array which is being reshaped to fewer devices. When a raid5 (or raid6) array is being reshaped to have fewer devices, conf->raid_disks is the latter and hence smaller number of devices. However sometimes we want to use a number which is the total number of currently required devices - the larger of the 'old' and 'new' sizes. Before we implemented reducing the number of devices, this was always 'new' i.e. ->raid_disks. Now we need max(raid_disks, previous_raid_disks) in those places. This particularly affects assembling an array that was shutdown while in the middle of a reshape to fewer devices. md.c needs a similar fix when interpreting the md metadata. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- drivers/md/raid5.c | 37 ++++++++++++++++++------------------- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 26ba42a79129..10eb1fce975e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2631,7 +2631,7 @@ static void analyze_sbs(mddev_t * mddev) rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= mddev->raid_disks) { + } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d4ce51b4d41b..c4366c9373c5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1238,22 +1238,22 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; + int disks = max(conf->raid_disks, conf->previous_raid_disks); sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); if (!sh) return 0; - memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); + memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif - if (grow_buffers(sh, conf->raid_disks)) { - shrink_buffers(sh, conf->raid_disks); + if (grow_buffers(sh, disks)) { + shrink_buffers(sh, disks); kmem_cache_free(conf->slab_cache, sh); return 0; } - sh->disks = conf->raid_disks; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -1265,7 +1265,7 @@ static int grow_one_stripe(raid5_conf_t *conf) static int grow_stripes(raid5_conf_t *conf, int num) { struct kmem_cache *sc; - int devs = conf->raid_disks; + int devs = max(conf->raid_disks, conf->previous_raid_disks); sprintf(conf->cache_name[0], "raid%d-%s", conf->level, mdname(conf->mddev)); @@ -3540,9 +3540,10 @@ static void unplug_slaves(mddev_t *mddev) { raid5_conf_t *conf = mddev->private; int i; + int devs = max(conf->raid_disks, conf->previous_raid_disks); rcu_read_lock(); - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < devs; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -4562,13 +4563,9 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) if (!sectors) sectors = mddev->dev_sectors; - if (!raid_disks) { + if (!raid_disks) /* size is defined by the smallest of previous and new size */ - if (conf->raid_disks < conf->previous_raid_disks) - raid_disks = conf->raid_disks; - else - raid_disks = conf->previous_raid_disks; - } + raid_disks = min(conf->raid_disks, conf->previous_raid_disks); sectors &= ~((sector_t)mddev->chunk_sectors - 1); sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); @@ -4669,7 +4666,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf) } per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; } - scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); + scribble = kmalloc(conf->scribble_len, GFP_KERNEL); if (!scribble) { err = -ENOMEM; break; @@ -4690,7 +4687,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf) static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; - int raid_disk, memory; + int raid_disk, memory, max_disks; mdk_rdev_t *rdev; struct disk_info *disk; @@ -4740,13 +4737,14 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) conf->bypass_threshold = BYPASS_THRESHOLD; conf->raid_disks = mddev->raid_disks; - conf->scribble_len = scribble_len(conf->raid_disks); if (mddev->reshape_position == MaxSector) conf->previous_raid_disks = mddev->raid_disks; else conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + max_disks = max(conf->raid_disks, conf->previous_raid_disks); + conf->scribble_len = scribble_len(max_disks); - conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), + conf->disks = kzalloc(max_disks * sizeof(struct disk_info), GFP_KERNEL); if (!conf->disks) goto abort; @@ -4764,7 +4762,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { raid_disk = rdev->raid_disk; - if (raid_disk >= conf->raid_disks + if (raid_disk >= max_disks || raid_disk < 0) continue; disk = conf->disks + raid_disk; @@ -4796,7 +4794,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) } memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; if (grow_stripes(conf, conf->max_nr_stripes)) { printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); @@ -4921,7 +4919,8 @@ static int run(mddev_t *mddev) test_bit(In_sync, &rdev->flags)) working_disks++; - mddev->degraded = conf->raid_disks - working_disks; + mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) + - working_disks); if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" -- cgit v1.2.2 From 5dd33c9a4c29015f6d87568d33521c98931a387e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 16:40:25 +1100 Subject: md/async: don't pass a memory pointer as a page pointer. md/raid6 passes a list of 'struct page *' to the async_tx routines, which then either DMA map them for offload, or take the page_address for CPU based calculations. For RAID6 we sometime leave 'blanks' in the list of pages. For CPU based calcs, we want to treat theses as a page of zeros. For offloaded calculations, we simply don't pass a page to the hardware. Currently the 'blanks' are encoded as a pointer to raid6_empty_zero_page. This is a 4096 byte memory region, not a 'struct page'. This is mostly handled correctly but is rather ugly. So change the code to pass and expect a NULL pointer for the blanks. When taking page_address of a page, we need to check for a NULL and in that case use raid6_empty_zero_page. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c4366c9373c5..dcd9e659ed9d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -720,7 +720,7 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) int i; for (i = 0; i < disks; i++) - srcs[i] = (void *)raid6_empty_zero_page; + srcs[i] = NULL; count = 0; i = d0_idx; @@ -816,7 +816,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) * slot number conversion for 'faila' and 'failb' */ for (i = 0; i < disks ; i++) - blocks[i] = (void *)raid6_empty_zero_page; + blocks[i] = NULL; count = 0; i = d0_idx; do { -- cgit v1.2.2 From 6d45d93ead319423099b82a4efd775bc0f159121 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 16 Oct 2009 23:18:14 +0100 Subject: dm snapshot: sort by chunk size to fix race Avoid a race causing corruption when snapshots of the same origin have different chunk sizes by sorting the internal list of snapshots by chunk size, largest first. https://bugzilla.redhat.com/show_bug.cgi?id=182659 For example, let's have two snapshots with different chunk sizes. The first snapshot (1) has small chunk size and the second snapshot (2) has large chunk size. Let's have chunks A, B, C in these snapshots: snapshot1: ====A==== ====B==== snapshot2: ==========C========== (Chunk size is a power of 2. Chunks are aligned.) A write to the origin at a position within A and C comes along. It triggers reallocation of A, then reallocation of C and links them together using A as the 'primary' exception. Then another write to the origin comes along at a position within B and C. It creates pending exception for B. C already has a reallocation in progress and it already has a primary exception (A), so nothing is done to it: B and C are not linked. If the reallocation of B finishes before the reallocation of C, because there is no link with the pending exception for C it does not know to wait for it and, the second write is dispatched to the origin and causes data corruption in the chunk C in snapshot2. To avoid this situation, we maintain snapshots sorted in descending order of chunk size. This leads to a guaranteed ordering on the links between the pending exceptions and avoids the problem explained above - both A and B now get linked to C. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 57f1bf7f3b7a..3a53a5a9bec8 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -296,6 +296,7 @@ static void __insert_origin(struct origin *o) */ static int register_snapshot(struct dm_snapshot *snap) { + struct dm_snapshot *l; struct origin *o, *new_o; struct block_device *bdev = snap->origin->bdev; @@ -319,7 +320,11 @@ static int register_snapshot(struct dm_snapshot *snap) __insert_origin(o); } - list_add_tail(&snap->list, &o->snapshots); + /* Sort the list according to chunk size, largest-first smallest-last */ + list_for_each_entry(l, &o->snapshots, list) + if (l->store->chunk_size < snap->store->chunk_size) + break; + list_add_tail(&snap->list, &l->list); up_write(&_origins_lock); return 0; -- cgit v1.2.2 From 034a186d29dbcef099e57ab23ec39440596be911 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 16 Oct 2009 23:18:14 +0100 Subject: dm snapshot: free exception store on init failure While initializing the snapshot module, if we fail to register the snapshot target then we must back-out the exception store module initialization. Cc: stable@kernel.org Signed-off-by: Jonathan Brassow Reviewed-by: Mikulas Patocka Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3a53a5a9bec8..53f4063f7ea4 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1470,7 +1470,7 @@ static int __init dm_snapshot_init(void) r = dm_register_target(&snapshot_target); if (r) { DMERR("snapshot target register failed %d", r); - return r; + goto bad_register_snapshot_target; } r = dm_register_target(&origin_target); @@ -1527,6 +1527,9 @@ bad2: dm_unregister_target(&origin_target); bad1: dm_unregister_target(&snapshot_target); + +bad_register_snapshot_target: + dm_exception_store_exit(); return r; } -- cgit v1.2.2 From bca915aae803cf01fde4461fc9c093cf5a86d7fc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 16 Oct 2009 23:18:15 +0100 Subject: dm log: userspace fix incorrect luid cast in userspace_ctr mips: drivers/md/dm-log-userspace-base.c: In function `userspace_ctr': drivers/md/dm-log-userspace-base.c:159: warning: cast from pointer to integer of different size Cc: stable@kernel.org Cc: Jonathan Brassow Signed-off-by: Andrew Morton Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 652bd33109e3..7ac2c1450d10 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -156,7 +156,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, } /* The ptr value is sufficient for local unique id */ - lc->luid = (uint64_t)lc; + lc->luid = (unsigned long)lc; lc->ti = ti; -- cgit v1.2.2 From 03022c54b9725026c0370a810168975c387ad04c Mon Sep 17 00:00:00 2001 From: Zdenek Kabelac Date: Fri, 16 Oct 2009 23:18:15 +0100 Subject: dm: add missing del_gendisk to alloc_dev error path Add missing del_gendisk() to error path when creation of workqueue fails. Otherwice there is a resource leak and following warning is shown: WARNING: at fs/sysfs/dir.c:487 sysfs_add_one+0xc5/0x160() sysfs: cannot create duplicate filename '/devices/virtual/block/dm-0' Cc: stable@kernel.org Signed-off-by: Zdenek Kabelac Reviewed-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 376f1ab48a24..c5f9918dab24 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1822,6 +1822,7 @@ static struct mapped_device *alloc_dev(int minor) bad_bdev: destroy_workqueue(md->wq); bad_thread: + del_gendisk(md->disk); put_disk(md->disk); bad_disk: blk_cleanup_queue(md->queue); -- cgit v1.2.2 From f88fb981183e71daf40bbd84bc8251bbf7b59e19 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Fri, 16 Oct 2009 23:18:15 +0100 Subject: dm: dec_pending needs locking to save error value Multiple instances of dec_pending() can run concurrently so a lock is needed when it saves the first error code. I have never experienced actual problem without locking and just found this during code inspection while implementing the barrier support patch for request-based dm. This patch adds the locking. I've done compile, boot and basic I/O testings. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c5f9918dab24..724efc63904d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -47,6 +47,7 @@ struct dm_io { atomic_t io_count; struct bio *bio; unsigned long start_time; + spinlock_t endio_lock; }; /* @@ -578,8 +579,12 @@ static void dec_pending(struct dm_io *io, int error) struct mapped_device *md = io->md; /* Push-back supersedes any I/O errors */ - if (error && !(io->error > 0 && __noflush_suspending(md))) - io->error = error; + if (unlikely(error)) { + spin_lock_irqsave(&io->endio_lock, flags); + if (!(io->error > 0 && __noflush_suspending(md))) + io->error = error; + spin_unlock_irqrestore(&io->endio_lock, flags); + } if (atomic_dec_and_test(&io->io_count)) { if (io->error == DM_ENDIO_REQUEUE) { @@ -1226,6 +1231,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) atomic_set(&ci.io->io_count, 1); ci.io->bio = bio; ci.io->md = md; + spin_lock_init(&ci.io->endio_lock); ci.sector = bio->bi_sector; ci.sector_count = bio_sectors(bio); if (unlikely(bio_empty_barrier(bio))) -- cgit v1.2.2 From 3f2412dc85260e5aae7ebb03bf50d5b1407e3083 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 16 Oct 2009 23:18:16 +0100 Subject: dm snapshot: require non zero chunk size by end of ctr If we are creating snapshot with memory-stored exception store, fail if the user didn't specify chunk size. Zero chunk size would probably crash a lot of places in the rest of snapshot code. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Reviewed-by: Jonathan Brassow Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 53f4063f7ea4..9cb392b3e920 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -673,6 +673,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) bio_list_init(&s->queued_bios); INIT_WORK(&s->queued_bios_work, flush_queued_bios); + if (!s->store->chunk_size) { + ti->error = "Chunk size not set"; + goto bad_load_and_register; + } + /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ if (register_snapshot(s)) { -- cgit v1.2.2 From 0e8c4e4e3ebb15756ddc4170a88149a2cd323cfe Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 16 Oct 2009 23:18:16 +0100 Subject: dm exception store: fix failed set_chunk_size error path Properly close the device if failing because of an invalid chunk size. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 556acff3952f..e5de7627c52d 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -251,7 +251,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, r = set_chunk_size(tmp_store, argv[2], &ti->error); if (r) - goto bad_cow; + goto bad_ctr; r = type->ctr(tmp_store, 0, NULL); if (r) { -- cgit v1.2.2 From 4c6fff445d7aa753957856278d4d93bcad6e2c14 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 16 Oct 2009 23:18:16 +0100 Subject: dm snapshot: lock snapshot while supplying status This patch locks the snapshot when returning status. It fixes a race when it could return an invalid number of free chunks if someone was simultaneously modifying it. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9cb392b3e920..9bc814aa2bbd 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1152,6 +1152,8 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, unsigned sz = 0; struct dm_snapshot *snap = ti->private; + down_write(&snap->lock); + switch (type) { case STATUSTYPE_INFO: if (!snap->valid) @@ -1183,6 +1185,8 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, break; } + up_write(&snap->lock); + return 0; } -- cgit v1.2.2 From df96eee679ba28c98cf722fa7c9f4286ee1ed0bd Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 16 Oct 2009 23:18:17 +0100 Subject: dm snapshot: use unsigned integer chunk size Use unsigned integer chunk size. Maximum chunk size is 512kB, there won't ever be need to use 4GB chunk size, so the number can be 32-bit. This fixes compiler failure on 32-bit systems with large block devices. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Reviewed-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.c | 20 +++++++++++--------- drivers/md/dm-exception-store.h | 8 ++++---- drivers/md/dm-snap-persistent.c | 16 ++++++++-------- drivers/md/dm-snap.c | 4 ++-- 4 files changed, 25 insertions(+), 23 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index e5de7627c52d..932d1b123143 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -155,7 +155,8 @@ static int set_chunk_size(struct dm_exception_store *store, char *value; chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); - if (*chunk_size_arg == '\0' || *value != '\0') { + if (*chunk_size_arg == '\0' || *value != '\0' || + chunk_size_ulong > UINT_MAX) { *error = "Invalid chunk size"; return -EINVAL; } @@ -171,34 +172,35 @@ static int set_chunk_size(struct dm_exception_store *store, */ chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9); - return dm_exception_store_set_chunk_size(store, chunk_size_ulong, + return dm_exception_store_set_chunk_size(store, + (unsigned) chunk_size_ulong, error); } int dm_exception_store_set_chunk_size(struct dm_exception_store *store, - unsigned long chunk_size_ulong, + unsigned chunk_size, char **error) { /* Check chunk_size is a power of 2 */ - if (!is_power_of_2(chunk_size_ulong)) { + if (!is_power_of_2(chunk_size)) { *error = "Chunk size is not a power of 2"; return -EINVAL; } /* Validate the chunk size against the device block size */ - if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { + if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } - if (chunk_size_ulong > INT_MAX >> SECTOR_SHIFT) { + if (chunk_size > INT_MAX >> SECTOR_SHIFT) { *error = "Chunk size is too high"; return -EINVAL; } - store->chunk_size = chunk_size_ulong; - store->chunk_mask = chunk_size_ulong - 1; - store->chunk_shift = ffs(chunk_size_ulong) - 1; + store->chunk_size = chunk_size; + store->chunk_mask = chunk_size - 1; + store->chunk_shift = ffs(chunk_size) - 1; return 0; } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 812c71872ba0..8a223a48802c 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -101,9 +101,9 @@ struct dm_exception_store { struct dm_dev *cow; /* Size of data blocks saved - must be a power of 2 */ - chunk_t chunk_size; - chunk_t chunk_mask; - chunk_t chunk_shift; + unsigned chunk_size; + unsigned chunk_mask; + unsigned chunk_shift; void *context; }; @@ -169,7 +169,7 @@ int dm_exception_store_type_register(struct dm_exception_store_type *type); int dm_exception_store_type_unregister(struct dm_exception_store_type *type); int dm_exception_store_set_chunk_size(struct dm_exception_store *store, - unsigned long chunk_size_ulong, + unsigned chunk_size, char **error); int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index d5b2e08750d5..0c746420c008 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -284,12 +284,13 @@ static int read_header(struct pstore *ps, int *new_snapshot) { int r; struct disk_header *dh; - chunk_t chunk_size; + unsigned chunk_size; int chunk_size_supplied = 1; char *chunk_err; /* - * Use default chunk size (or hardsect_size, if larger) if none supplied + * Use default chunk size (or logical_block_size, if larger) + * if none supplied */ if (!ps->store->chunk_size) { ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, @@ -334,10 +335,9 @@ static int read_header(struct pstore *ps, int *new_snapshot) return 0; if (chunk_size_supplied) - DMWARN("chunk size %llu in device metadata overrides " - "table chunk size of %llu.", - (unsigned long long)chunk_size, - (unsigned long long)ps->store->chunk_size); + DMWARN("chunk size %u in device metadata overrides " + "table chunk size of %u.", + chunk_size, ps->store->chunk_size); /* We had a bogus chunk_size. Fix stuff up. */ free_area(ps); @@ -345,8 +345,8 @@ static int read_header(struct pstore *ps, int *new_snapshot) r = dm_exception_store_set_chunk_size(ps->store, chunk_size, &chunk_err); if (r) { - DMERR("invalid on-disk chunk size %llu: %s.", - (unsigned long long)chunk_size, chunk_err); + DMERR("invalid on-disk chunk size %u: %s.", + chunk_size, chunk_err); return r; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9bc814aa2bbd..3a3ba46e6d4b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -961,7 +961,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) src.bdev = bdev; src.sector = chunk_to_sector(s->store, pe->e.old_chunk); - src.count = min(s->store->chunk_size, dev_size - src.sector); + src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); dest.bdev = s->store->cow->bdev; dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); @@ -1402,7 +1402,7 @@ static void origin_resume(struct dm_target *ti) struct dm_dev *dev = ti->private; struct dm_snapshot *snap; struct origin *o; - chunk_t chunk_size = 0; + unsigned chunk_size = 0; down_read(&_origins_lock); o = __lookup_origin(dev->bdev); -- cgit v1.2.2 From c1cc65caa19bb8a1b2e371000ef2719581db1691 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 16 Oct 2009 23:18:22 +0100 Subject: dm snapshot: allow chunk size to be less than page size Allow the snapshot chunk size to be smaller than the page size The code is now capable of handling this due to some previous fixes and enhancements. As the page size varies between computers, prior to this patch, the chunk size of a snapshot dictated which machines could read it: Snapshots created on one machine might not be readable on another. Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Reviewed-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 932d1b123143..7dbe652efb5a 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -138,16 +138,6 @@ int dm_exception_store_type_unregister(struct dm_exception_store_type *type) } EXPORT_SYMBOL(dm_exception_store_type_unregister); -/* - * Round a number up to the nearest 'size' boundary. size must - * be a power of 2. - */ -static ulong round_up(ulong n, ulong size) -{ - size--; - return (n + size) & ~size; -} - static int set_chunk_size(struct dm_exception_store *store, const char *chunk_size_arg, char **error) { @@ -166,12 +156,6 @@ static int set_chunk_size(struct dm_exception_store *store, return 0; } - /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. - */ - chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9); - return dm_exception_store_set_chunk_size(store, (unsigned) chunk_size_ulong, error); -- cgit v1.2.2 From 6629542e79255e0dbef8ec82eaf644e1b2546c3c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 19 Oct 2009 18:09:32 -0700 Subject: md/raid6: kill a gcc-4.0.1 'uninitialized variable' warning Signed-off-by: Dan Williams --- drivers/md/raid5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dcd9e659ed9d..81abefc172d9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -156,16 +156,16 @@ static inline int raid6_next_disk(int disk, int raid_disks) static int raid6_idx_to_slot(int idx, struct stripe_head *sh, int *count, int syndrome_disks) { - int slot; + int slot = *count; if (sh->ddf_layout) - slot = (*count)++; + (*count)++; if (idx == sh->pd_idx) return syndrome_disks; if (idx == sh->qd_idx) return syndrome_disks + 1; if (!sh->ddf_layout) - slot = (*count)++; + (*count)++; return slot; } -- cgit v1.2.2 From 24395a85d8efe6eee477ea35c73d045a8dd7a3a1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Nov 2009 14:59:27 +1100 Subject: md: don't clear endpoint for resync when resync is interrupted. If a 'sync_max' has been set (via sysfs), it is wrong to clear it until a resync (or reshape or recovery ...) actually reached that point. So if a resync is interrupted (e.g. by device failure), leave 'resync_max' unchanged. This is particularly important for 'reshape' operations that do not change the size of the array. For such operations mdadm needs to monitor the reshape taking rolling backups of the section being reshaped. If resync_max gets cleared, the reshape can get ahead of mdadm and then the backups that mdadm creates are useless. This is suitable for 2.6.31.y stable kernels. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 10eb1fce975e..e64c971038d1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6504,8 +6504,9 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; mddev->curr_resync_completed = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + /* We completed so max setting can be forgotten. */ + mddev->resync_max = MaxSector; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); -- cgit v1.2.2 From 8dee7211467a56b7eb4e4359efb0aa4a72e1b6f3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Nov 2009 14:59:29 +1100 Subject: md/raid5: make sure curr_sync_completes is uptodate when reshape starts This value is visible through sysfs and is used by mdadm when it manages a reshape (backing up data that is about to be rearranged). So it is important that it is always correct. Current it does not get updated properly when a reshape starts which can cause problems when assembling an array that is in the middle of being reshaped. This is suitable for 2.6.31.y stable kernels. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81abefc172d9..dcce204b6c73 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4049,6 +4049,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); *skipped = 1; return sector_nr; } -- cgit v1.2.2 From 5e8651060cea6b44844521ddcac665e2c021f5d8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 12 Nov 2009 12:08:04 +1100 Subject: md: factor out updating of 'recovery_offset'. Each device has its own 'recovery_offset' showing how far recovery has progressed on the device. As the only real significance of this is that fact that it can be stored in the metadata and recovered at restart, and as only 1.x metadata can do this, we were only updating 'recovery_offset' to 'curr_resync_completed' when updating v1.x metadata. But this is wrong, and we will shortly make limited use of this field in v0.90 metadata. So move the update into common code. Signed-off-by: NeilBrown --- drivers/md/md.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index e64c971038d1..01b9a0fd16e0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1382,8 +1382,6 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags)) { - if (mddev->curr_resync_completed > rdev->recovery_offset) - rdev->recovery_offset = mddev->curr_resync_completed; if (rdev->recovery_offset > 0) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); @@ -1917,6 +1915,14 @@ static void sync_sbs(mddev_t * mddev, int nospares) */ mdk_rdev_t *rdev; + /* First make sure individual recovery_offsets are correct */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->sb_events == mddev->events || (nospares && -- cgit v1.2.2 From 0261cd9f1cb42fa44ece314d27868d83742bdf03 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Nov 2009 17:40:48 +1100 Subject: md: allow v0.91 metadata to record devices as being active but not in-sync. This is a combination that didn't really make sense before. However when a reshape is converting e.g. raid5 -> raid6, the extra device is not fully in-sync, but is certainly active and contains important data. So allow that start to be meaningful and in particular get the 'recovery_offset' value (which is needed for any non-in-sync active device) from the reshape_position. Signed-off-by: NeilBrown --- drivers/md/md.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 01b9a0fd16e0..b182f86a19dd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -944,6 +944,14 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; + } else if (desc->state & (1<minor_version >= 91) { + rdev->recovery_offset = 0; + rdev->raid_disk = desc->raid_disk; + } } if (desc->state & (1<flags); @@ -1032,8 +1040,19 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) list_for_each_entry(rdev2, &mddev->disks, same_set) { mdp_disk_t *d; int desc_nr; - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + int is_active = test_bit(In_sync, &rdev2->flags); + + if (rdev2->raid_disk >= 0 && + sb->minor_version >= 91) + /* we have nowhere to store the recovery_offset, + * but if it is not below the reshape_position, + * we can piggy-back on that. + */ + is_active = 1; + if (rdev2->raid_disk < 0 || + test_bit(Faulty, &rdev2->flags)) + is_active = 0; + if (is_active) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; @@ -1043,16 +1062,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + if (is_active) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ if (test_bit(Faulty, &rdev2->flags)) d->state = (1<flags)) { + else if (is_active) { d->state = (1<state |= (1<flags)) + d->state |= (1< Date: Fri, 13 Nov 2009 17:40:51 +1100 Subject: Don't unconditionally set in_sync on newly added device in raid5_reshape When a reshape finds that it can add spare devices into the array, those devices might already be 'in_sync' if they are beyond the old size of the array, or they might not if they are within the array. The first case happens when we change an N-drive RAID5 to an N+1-drive RAID5. The second happens when we convert an N-drive RAID5 to an N+1-drive RAID6. So set the flag more carefully. Also, ->recovery_offset is only meaningful when the flag is clear, so only set it in that case. This change needs the preceding two to ensure that the non-in_sync device doesn't get evicted from the array when it is stopped, in the case where v0.90 metadata is used. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dcce204b6c73..ab40529bdabe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5361,9 +5361,11 @@ static int raid5_start_reshape(mddev_t *mddev) !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; - set_bit(In_sync, &rdev->flags); + if (rdev->raid_disk >= conf->previous_raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; added_devices++; - rdev->recovery_offset = 0; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) -- cgit v1.2.2 From c148ffdcda00b6599b70f8b65e6a1fadd1dbb127 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Nov 2009 17:47:00 +1100 Subject: md/raid5: Allow dirty-degraded arrays to be assembled when only party is degraded. Normally is it not safe to allow a raid5 that is both dirty and degraded to be assembled without explicit request from that admin, as it can cause hidden data corruption. This is because 'dirty' means that the parity cannot be trusted, and 'degraded' means that the parity needs to be used. However, if the device that is missing contains only parity, then there is no issue and assembly can continue. This particularly applies when a RAID5 is being converted to a RAID6 and there is an unclean shutdown while the conversion is happening. So check for whether the degraded space only contains parity, and in that case, allow the assembly. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab40529bdabe..d29215d966da 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4823,11 +4823,40 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) return ERR_PTR(-ENOMEM); } + +static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) +{ + switch (algo) { + case ALGORITHM_PARITY_0: + if (raid_disk < max_degraded) + return 1; + break; + case ALGORITHM_PARITY_N: + if (raid_disk >= raid_disks - max_degraded) + return 1; + break; + case ALGORITHM_PARITY_0_6: + if (raid_disk == 0 || + raid_disk == raid_disks - 1) + return 1; + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (raid_disk == raid_disks - 1) + return 1; + } + return 0; +} + static int run(mddev_t *mddev) { raid5_conf_t *conf; int working_disks = 0, chunk_size; + int dirty_parity_disks = 0; mdk_rdev_t *rdev; + sector_t reshape_offset = 0; if (mddev->recovery_cp != MaxSector) printk(KERN_NOTICE "raid5: %s is not clean" @@ -4861,6 +4890,7 @@ static int run(mddev_t *mddev) "on a stripe boundary\n"); return -EINVAL; } + reshape_offset = here_new * mddev->new_chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; sector_div(here_old, mddev->chunk_sectors * @@ -4916,10 +4946,51 @@ static int run(mddev_t *mddev) /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0 && - test_bit(In_sync, &rdev->flags)) + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk < 0) + continue; + if (test_bit(In_sync, &rdev->flags)) working_disks++; + /* This disc is not fully in-sync. However if it + * just stored parity (beyond the recovery_offset), + * when we don't need to be concerned about the + * array being dirty. + * When reshape goes 'backwards', we never have + * partially completed devices, so we only need + * to worry about reshape going forwards. + */ + /* Hack because v0.91 doesn't store recovery_offset properly. */ + if (mddev->major_version == 0 && + mddev->minor_version > 90) + rdev->recovery_offset = reshape_offset; + + printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", + rdev->raid_disk, working_disks, conf->prev_algo, + conf->previous_raid_disks, conf->max_degraded, + conf->algorithm, conf->raid_disks, + only_parity(rdev->raid_disk, + conf->prev_algo, + conf->previous_raid_disks, + conf->max_degraded), + only_parity(rdev->raid_disk, + conf->algorithm, + conf->raid_disks, + conf->max_degraded)); + if (rdev->recovery_offset < reshape_offset) { + /* We need to check old and new layout */ + if (!only_parity(rdev->raid_disk, + conf->algorithm, + conf->raid_disks, + conf->max_degraded)) + continue; + } + if (!only_parity(rdev->raid_disk, + conf->prev_algo, + conf->previous_raid_disks, + conf->max_degraded)) + continue; + dirty_parity_disks++; + } mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) - working_disks); @@ -4935,7 +5006,7 @@ static int run(mddev_t *mddev) mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); mddev->resync_max_sectors = mddev->dev_sectors; - if (mddev->degraded > 0 && + if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) printk(KERN_WARNING -- cgit v1.2.2