From 1442577bf6da1a31ae6555212202be9a2cec5642 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 16 Oct 2009 15:55:25 +1100
Subject: Revert "md: do not progress the resync process if the stripe was
 blocked"

This reverts commit df10cfbc4d7ab93260d997df754219d390d62a9d.

This patch was based on a misunderstanding and risks introducing a busy-wait loop.
So revert it.

Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 94829804ab7f..ec14ff5b508a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2896,7 +2896,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
  *
  */
 
-static bool handle_stripe5(struct stripe_head *sh)
+static void handle_stripe5(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks, i;
@@ -3167,11 +3167,9 @@ static bool handle_stripe5(struct stripe_head *sh)
 	ops_run_io(sh, &s);
 
 	return_io(return_bi);
-
-	return blocked_rdev == NULL;
 }
 
-static bool handle_stripe6(struct stripe_head *sh)
+static void handle_stripe6(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
@@ -3455,17 +3453,14 @@ static bool handle_stripe6(struct stripe_head *sh)
 	ops_run_io(sh, &s);
 
 	return_io(return_bi);
-
-	return blocked_rdev == NULL;
 }
 
-/* returns true if the stripe was handled */
-static bool handle_stripe(struct stripe_head *sh)
+static void handle_stripe(struct stripe_head *sh)
 {
 	if (sh->raid_conf->level == 6)
-		return handle_stripe6(sh);
+		handle_stripe6(sh);
 	else
-		return handle_stripe5(sh);
+		handle_stripe5(sh);
 }
 
 static void raid5_activate_delayed(raid5_conf_t *conf)
@@ -4277,9 +4272,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	clear_bit(STRIPE_INSYNC, &sh->state);
 	spin_unlock(&sh->lock);
 
-	/* wait for any blocked device to be handled */
-	while (unlikely(!handle_stripe(sh)))
-		;
+	handle_stripe(sh);
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
-- 
cgit v1.2.2


From 1d9d52416c0445019ccc1f0fddb9a227456eb61b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 16 Oct 2009 15:55:32 +1100
Subject: md/raid1/raid10: add a cond_resched

During 'check' of a raid1 or raid10 it is possible for the management
thread to spend a lot of time running 'memcmp' on blocks from
different devices, so make sure the thread has a chance to schedule.
raid5d already has a cond_resched (in process_stripe).

Reported-By: Lee Howard <faxguy@howardsilvan.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 1 +
 drivers/md/raid10.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d1b9bd5fd4f6..71a01a2e1938 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1683,6 +1683,7 @@ static void raid1d(mddev_t *mddev)
 				generic_make_request(bio);
 			}
 		}
+		cond_resched();
 	}
 	if (unplug)
 		unplug_slaves(mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 51c4c5c4d87a..69fc76caa469 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1632,6 +1632,7 @@ static void raid10d(mddev_t *mddev)
 				generic_make_request(bio);
 			}
 		}
+		cond_resched();
 	}
 	if (unplug)
 		unplug_slaves(mddev);
-- 
cgit v1.2.2


From f5efd45ae597c96ed017afad5662b67d55b402a0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 16 Oct 2009 15:55:38 +1100
Subject: md/raid5: initialize conf->device_lock earlier

Deallocating a raid5_conf_t structure requires taking 'device_lock'.
Ensure it is initialized before it is used, i.e. initialize the lock
before attempting any further initializations that might fail.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ec14ff5b508a..c3e596778618 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4715,6 +4715,18 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
 	if (conf == NULL)
 		goto abort;
+	spin_lock_init(&conf->device_lock);
+	init_waitqueue_head(&conf->wait_for_stripe);
+	init_waitqueue_head(&conf->wait_for_overlap);
+	INIT_LIST_HEAD(&conf->handle_list);
+	INIT_LIST_HEAD(&conf->hold_list);
+	INIT_LIST_HEAD(&conf->delayed_list);
+	INIT_LIST_HEAD(&conf->bitmap_list);
+	INIT_LIST_HEAD(&conf->inactive_list);
+	atomic_set(&conf->active_stripes, 0);
+	atomic_set(&conf->preread_active_stripes, 0);
+	atomic_set(&conf->active_aligned_reads, 0);
+	conf->bypass_threshold = BYPASS_THRESHOLD;
 
 	conf->raid_disks = mddev->raid_disks;
 	conf->scribble_len = scribble_len(conf->raid_disks);
@@ -4737,19 +4749,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	if (raid5_alloc_percpu(conf) != 0)
 		goto abort;
 
-	spin_lock_init(&conf->device_lock);
-	init_waitqueue_head(&conf->wait_for_stripe);
-	init_waitqueue_head(&conf->wait_for_overlap);
-	INIT_LIST_HEAD(&conf->handle_list);
-	INIT_LIST_HEAD(&conf->hold_list);
-	INIT_LIST_HEAD(&conf->delayed_list);
-	INIT_LIST_HEAD(&conf->bitmap_list);
-	INIT_LIST_HEAD(&conf->inactive_list);
-	atomic_set(&conf->active_stripes, 0);
-	atomic_set(&conf->preread_active_stripes, 0);
-	atomic_set(&conf->active_aligned_reads, 0);
-	conf->bypass_threshold = BYPASS_THRESHOLD;
-
 	pr_debug("raid5: run(%s) called.\n", mdname(mddev));
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-- 
cgit v1.2.2


From ed9bfdf1a40952fd0f8094ec77f876b84ead69af Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 16 Oct 2009 15:55:44 +1100
Subject: md: raid1/raid10: handle allocation errors during array setup.

Both raid1 and raid10 create a mempool during startup.
If the 'alloc' function for this mempool fails, unplug_slaves
is called.
If that happens when the pool is being initialised, unplug_slaves
will try to use the 'conf' structure that isn't filled in yet, and
badness will happen.

So ensure that unplug_slaves doesn't get called unless we know
that the conf structure if fully initialised.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 5 +++--
 drivers/md/raid10.c | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 71a01a2e1938..a053423785c9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -64,7 +64,7 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
 	r1_bio = kzalloc(size, gfp_flags);
-	if (!r1_bio)
+	if (!r1_bio && pi->mddev)
 		unplug_slaves(pi->mddev);
 
 	return r1_bio;
@@ -1979,13 +1979,14 @@ static int run(mddev_t *mddev)
 	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
 		goto out_no_mem;
-	conf->poolinfo->mddev = mddev;
+	conf->poolinfo->mddev = NULL;
 	conf->poolinfo->raid_disks = mddev->raid_disks;
 	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 					  r1bio_pool_free,
 					  conf->poolinfo);
 	if (!conf->r1bio_pool)
 		goto out_no_mem;
+	conf->poolinfo->mddev = mddev;
 
 	spin_lock_init(&conf->device_lock);
 	mddev->queue->queue_lock = &conf->device_lock;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 69fc76caa469..c2cb7b87b440 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -68,7 +68,7 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
 	r10_bio = kzalloc(size, gfp_flags);
-	if (!r10_bio)
+	if (!r10_bio && conf->mddev)
 		unplug_slaves(conf->mddev);
 
 	return r10_bio;
@@ -2096,7 +2096,6 @@ static int run(mddev_t *mddev)
 	if (!conf->tmppage)
 		goto out_free_conf;
 
-	conf->mddev = mddev;
 	conf->raid_disks = mddev->raid_disks;
 	conf->near_copies = nc;
 	conf->far_copies = fc;
@@ -2133,6 +2132,7 @@ static int run(mddev_t *mddev)
 		goto out_free_conf;
 	}
 
+	conf->mddev = mddev;
 	spin_lock_init(&conf->device_lock);
 	mddev->queue->queue_lock = &conf->device_lock;
 
-- 
cgit v1.2.2


From ae8fa2831bbc5092ee9d1b90e623af881cf27140 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 16 Oct 2009 15:56:01 +1100
Subject: md: remove clumsy usage of do_sync_mapping_range from bitmap code

and replace with vfs_fsync which is much neater (but wasn't exported,
or even in existence at the time the code was written).

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 6986b0059d23..60e2b322db11 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1624,10 +1624,11 @@ int bitmap_create(mddev_t *mddev)
 	bitmap->offset = mddev->bitmap_offset;
 	if (file) {
 		get_file(file);
-		do_sync_mapping_range(file->f_mapping, 0, LLONG_MAX,
-				      SYNC_FILE_RANGE_WAIT_BEFORE |
-				      SYNC_FILE_RANGE_WRITE |
-				      SYNC_FILE_RANGE_WAIT_AFTER);
+		/* As future accesses to this file will use bmap,
+		 * and bypass the page cache, we must sync the file
+		 * first.
+		 */
+		vfs_fsync(file, file->f_dentry, 1);
 	}
 	/* read superblock from bitmap file (this sets bitmap->chunksize) */
 	err = bitmap_read_sb(bitmap);
-- 
cgit v1.2.2


From dce3a7a42d585b74ce68081010b42afe81c8f4c4 Mon Sep 17 00:00:00 2001
From: Vladimir Dronnikov <dronnikov@gmail.com>
Date: Fri, 16 Oct 2009 16:25:19 +1100
Subject: md: drivers/md/unroll.pl replaced with awk analog

drivers/md/unroll.pl replaced by awk script to drop build-time
dependency on perl

Signed-off-by: Vladimir Dronnikov <dronnikov@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/Makefile           | 22 +++++++++++-----------
 drivers/md/raid6altivec.uc    |  2 +-
 drivers/md/raid6int.uc        |  2 +-
 drivers/md/raid6test/Makefile | 42 +++++++++++++++++++++---------------------
 drivers/md/unroll.awk         | 20 ++++++++++++++++++++
 drivers/md/unroll.pl          | 24 ------------------------
 6 files changed, 54 insertions(+), 58 deletions(-)
 create mode 100644 drivers/md/unroll.awk
 delete mode 100644 drivers/md/unroll.pl

(limited to 'drivers/md')

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1dc4185bd781..e355e7f6a536 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -46,7 +46,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
 quiet_cmd_unroll = UNROLL  $@
-      cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
+      cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \
                    < $< > $@ || ( rm -f $@ && exit 1 )
 
 ifeq ($(CONFIG_ALTIVEC),y)
@@ -59,56 +59,56 @@ endif
 
 targets += raid6int1.c
 $(obj)/raid6int1.c:   UNROLL := 1
-$(obj)/raid6int1.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+$(obj)/raid6int1.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 targets += raid6int2.c
 $(obj)/raid6int2.c:   UNROLL := 2
-$(obj)/raid6int2.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+$(obj)/raid6int2.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 targets += raid6int4.c
 $(obj)/raid6int4.c:   UNROLL := 4
-$(obj)/raid6int4.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+$(obj)/raid6int4.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 targets += raid6int8.c
 $(obj)/raid6int8.c:   UNROLL := 8
-$(obj)/raid6int8.c:   $(src)/raid6int.uc $(src)/unroll.pl FORCE
+$(obj)/raid6int8.c:   $(src)/raid6int.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 targets += raid6int16.c
 $(obj)/raid6int16.c:  UNROLL := 16
-$(obj)/raid6int16.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE
+$(obj)/raid6int16.c:  $(src)/raid6int.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 targets += raid6int32.c
 $(obj)/raid6int32.c:  UNROLL := 32
-$(obj)/raid6int32.c:  $(src)/raid6int.uc $(src)/unroll.pl FORCE
+$(obj)/raid6int32.c:  $(src)/raid6int.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 CFLAGS_raid6altivec1.o += $(altivec_flags)
 targets += raid6altivec1.c
 $(obj)/raid6altivec1.c:   UNROLL := 1
-$(obj)/raid6altivec1.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+$(obj)/raid6altivec1.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 CFLAGS_raid6altivec2.o += $(altivec_flags)
 targets += raid6altivec2.c
 $(obj)/raid6altivec2.c:   UNROLL := 2
-$(obj)/raid6altivec2.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+$(obj)/raid6altivec2.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 CFLAGS_raid6altivec4.o += $(altivec_flags)
 targets += raid6altivec4.c
 $(obj)/raid6altivec4.c:   UNROLL := 4
-$(obj)/raid6altivec4.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+$(obj)/raid6altivec4.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 CFLAGS_raid6altivec8.o += $(altivec_flags)
 targets += raid6altivec8.c
 $(obj)/raid6altivec8.c:   UNROLL := 8
-$(obj)/raid6altivec8.c:   $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
+$(obj)/raid6altivec8.c:   $(src)/raid6altivec.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
 quiet_cmd_mktable = TABLE   $@
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc
index 699dfeee4944..2654d5c854be 100644
--- a/drivers/md/raid6altivec.uc
+++ b/drivers/md/raid6altivec.uc
@@ -15,7 +15,7 @@
  *
  * $#-way unrolled portable integer math RAID-6 instruction set
  *
- * This file is postprocessed using unroll.pl
+ * This file is postprocessed using unroll.awk
  *
  * <benh> hpa: in process,
  * you can just "steal" the vec unit with enable_kernel_altivec() (but
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc
index f9bf9cba357f..d1e276a14fab 100644
--- a/drivers/md/raid6int.uc
+++ b/drivers/md/raid6int.uc
@@ -15,7 +15,7 @@
  *
  * $#-way unrolled portable integer math RAID-6 instruction set
  *
- * This file is postprocessed using unroll.pl
+ * This file is postprocessed using unroll.awk
  */
 
 #include <linux/raid/pq.h>
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile
index 58ffdf4f5161..2874cbef529d 100644
--- a/drivers/md/raid6test/Makefile
+++ b/drivers/md/raid6test/Makefile
@@ -7,7 +7,7 @@ CC	 = gcc
 OPTFLAGS = -O2			# Adjust as desired
 CFLAGS	 = -I.. -I ../../../include -g $(OPTFLAGS)
 LD	 = ld
-PERL	 = perl
+AWK	 = awk
 AR	 = ar
 RANLIB	 = ranlib
 
@@ -35,35 +35,35 @@ raid6.a: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \
 raid6test: test.c raid6.a
 	$(CC) $(CFLAGS) -o raid6test $^
 
-raid6altivec1.c: raid6altivec.uc ../unroll.pl
-	$(PERL) ../unroll.pl 1 < raid6altivec.uc > $@
+raid6altivec1.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=1 < raid6altivec.uc > $@
 
-raid6altivec2.c: raid6altivec.uc ../unroll.pl
-	$(PERL) ../unroll.pl 2 < raid6altivec.uc > $@
+raid6altivec2.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=2 < raid6altivec.uc > $@
 
-raid6altivec4.c: raid6altivec.uc ../unroll.pl
-	$(PERL) ../unroll.pl 4 < raid6altivec.uc > $@
+raid6altivec4.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=4 < raid6altivec.uc > $@
 
-raid6altivec8.c: raid6altivec.uc ../unroll.pl
-	$(PERL) ../unroll.pl 8 < raid6altivec.uc > $@
+raid6altivec8.c: raid6altivec.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=8 < raid6altivec.uc > $@
 
-raid6int1.c: raid6int.uc ../unroll.pl
-	$(PERL) ../unroll.pl 1 < raid6int.uc > $@
+raid6int1.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=1 < raid6int.uc > $@
 
-raid6int2.c: raid6int.uc ../unroll.pl
-	$(PERL) ../unroll.pl 2 < raid6int.uc > $@
+raid6int2.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=2 < raid6int.uc > $@
 
-raid6int4.c: raid6int.uc ../unroll.pl
-	$(PERL) ../unroll.pl 4 < raid6int.uc > $@
+raid6int4.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=4 < raid6int.uc > $@
 
-raid6int8.c: raid6int.uc ../unroll.pl
-	$(PERL) ../unroll.pl 8 < raid6int.uc > $@
+raid6int8.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=8 < raid6int.uc > $@
 
-raid6int16.c: raid6int.uc ../unroll.pl
-	$(PERL) ../unroll.pl 16 < raid6int.uc > $@
+raid6int16.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=16 < raid6int.uc > $@
 
-raid6int32.c: raid6int.uc ../unroll.pl
-	$(PERL) ../unroll.pl 32 < raid6int.uc > $@
+raid6int32.c: raid6int.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=32 < raid6int.uc > $@
 
 raid6tables.c: mktables
 	./mktables > raid6tables.c
diff --git a/drivers/md/unroll.awk b/drivers/md/unroll.awk
new file mode 100644
index 000000000000..c6aa03631df8
--- /dev/null
+++ b/drivers/md/unroll.awk
@@ -0,0 +1,20 @@
+
+# This filter requires one command line option of form -vN=n
+# where n must be a decimal number.
+#
+# Repeat each input line containing $$ n times, replacing $$ with 0...n-1.
+# Replace each $# with n, and each $* with a single $.
+
+BEGIN {
+	n = N + 0
+}
+{
+	if (/\$\$/) { rep = n } else { rep = 1 }
+	for (i = 0; i < rep; ++i) {
+		tmp = $0
+		gsub(/\$\$/, i, tmp)
+		gsub(/\$\#/, n, tmp)
+		gsub(/\$\*/, "$", tmp)
+		print tmp
+	}
+}
diff --git a/drivers/md/unroll.pl b/drivers/md/unroll.pl
deleted file mode 100644
index 3acc710a20ea..000000000000
--- a/drivers/md/unroll.pl
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/perl
-#
-# Take a piece of C code and for each line which contains the sequence $$
-# repeat n times with $ replaced by 0...n-1; the sequence $# is replaced
-# by the unrolling factor, and $* with a single $
-#
-
-($n) = @ARGV;
-$n += 0;
-
-while ( defined($line = <STDIN>) ) {
-    if ( $line =~ /\$\$/ ) {
-	$rep = $n;
-    } else {
-	$rep = 1;
-    }
-    for ( $i = 0 ; $i < $rep ; $i++ ) {
-	$tmp = $line;
-	$tmp =~ s/\$\$/$i/g;
-	$tmp =~ s/\$\#/$n/g;
-	$tmp =~ s/\$\*/\$/g;
-	print $tmp;
-    }
-}
-- 
cgit v1.2.2


From 417b8d4ac868cf58d6c68f52d72f7648413e0edc Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 16 Oct 2009 16:25:22 +1100
Subject: md/raid456: downlevel multicore operations to raid_run_ops

The percpu conversion allowed a straightforward handoff of stripe
processing to the async subsytem that initially showed some modest gains
(+4%).  However, this model is too simplistic and leads to stripes
bouncing between raid5d and the async thread pool for every invocation
of handle_stripe().  As reported by Holger this can fall into a
pathological situation severely impacting throughput (6x performance
loss).

By downleveling the parallelism to raid_run_ops the pathological
stripe_head bouncing is eliminated.  This version still exhibits an
average 11% throughput loss for:

	mdadm --create /dev/md0 /dev/sd[b-q] -n 16 -l 6
	echo 1024 > /sys/block/md0/md/stripe_cache_size
	dd if=/dev/zero of=/dev/md0 bs=1024k count=2048

...but the results are at least stable and can be used as a base for
further multicore experimentation.

Reported-by: Holger Kiehl <Holger.Kiehl@dwd.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 75 +++++++++++++++++++++++++++++-------------------------
 drivers/md/raid5.h | 12 ++++++++-
 2 files changed, 51 insertions(+), 36 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c3e596778618..25c3c29134d1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,7 +1139,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -1204,6 +1204,36 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
+#ifdef CONFIG_MULTICORE_RAID456
+static void async_run_ops(void *param, async_cookie_t cookie)
+{
+	struct stripe_head *sh = param;
+	unsigned long ops_request = sh->ops.request;
+
+	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
+	wake_up(&sh->ops.wait_for_ops);
+
+	__raid_run_ops(sh, ops_request);
+	release_stripe(sh);
+}
+
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+{
+	/* since handle_stripe can be called outside of raid5d context
+	 * we need to ensure sh->ops.request is de-staged before another
+	 * request arrives
+	 */
+	wait_event(sh->ops.wait_for_ops,
+		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
+	sh->ops.request = ops_request;
+
+	atomic_inc(&sh->count);
+	async_schedule(async_run_ops, sh);
+}
+#else
+#define raid_run_ops __raid_run_ops
+#endif
+
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
@@ -1213,6 +1243,9 @@ static int grow_one_stripe(raid5_conf_t *conf)
 	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
+	#ifdef CONFIG_MULTICORE_RAID456
+	init_waitqueue_head(&sh->ops.wait_for_ops);
+	#endif
 
 	if (grow_buffers(sh, conf->raid_disks)) {
 		shrink_buffers(sh, conf->raid_disks);
@@ -1329,6 +1362,9 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 
 		nsh->raid_conf = conf;
 		spin_lock_init(&nsh->lock);
+		#ifdef CONFIG_MULTICORE_RAID456
+		init_waitqueue_head(&nsh->ops.wait_for_ops);
+		#endif
 
 		list_add(&nsh->lru, &newstripes);
 	}
@@ -4342,37 +4378,6 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	return handled;
 }
 
-#ifdef CONFIG_MULTICORE_RAID456
-static void __process_stripe(void *param, async_cookie_t cookie)
-{
-	struct stripe_head *sh = param;
-
-	handle_stripe(sh);
-	release_stripe(sh);
-}
-
-static void process_stripe(struct stripe_head *sh, struct list_head *domain)
-{
-	async_schedule_domain(__process_stripe, sh, domain);
-}
-
-static void synchronize_stripe_processing(struct list_head *domain)
-{
-	async_synchronize_full_domain(domain);
-}
-#else
-static void process_stripe(struct stripe_head *sh, struct list_head *domain)
-{
-	handle_stripe(sh);
-	release_stripe(sh);
-	cond_resched();
-}
-
-static void synchronize_stripe_processing(struct list_head *domain)
-{
-}
-#endif
-
 
 /*
  * This is our raid5 kernel thread.
@@ -4386,7 +4391,6 @@ static void raid5d(mddev_t *mddev)
 	struct stripe_head *sh;
 	raid5_conf_t *conf = mddev->private;
 	int handled;
-	LIST_HEAD(raid_domain);
 
 	pr_debug("+++ raid5d active\n");
 
@@ -4423,7 +4427,9 @@ static void raid5d(mddev_t *mddev)
 		spin_unlock_irq(&conf->device_lock);
 		
 		handled++;
-		process_stripe(sh, &raid_domain);
+		handle_stripe(sh);
+		release_stripe(sh);
+		cond_resched();
 
 		spin_lock_irq(&conf->device_lock);
 	}
@@ -4431,7 +4437,6 @@ static void raid5d(mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
-	synchronize_stripe_processing(&raid_domain);
 	async_tx_issue_pending_all();
 	unplug_slaves(mddev);
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2390e0e83daf..dcefdc9629ee 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -214,12 +214,20 @@ struct stripe_head {
 	int			disks;		/* disks in stripe */
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
-	/* stripe_operations
+	/**
+	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
+	 * @target2 - 2nd compute target in the raid6 case
+	 * @zero_sum_result - P and Q verification flags
+	 * @request - async service request flags for raid_run_ops
 	 */
 	struct stripe_operations {
 		int 		     target, target2;
 		enum sum_check_flags zero_sum_result;
+		#ifdef CONFIG_MULTICORE_RAID456
+		unsigned long	     request;
+		wait_queue_head_t    wait_for_ops;
+		#endif
 	} ops;
 	struct r5dev {
 		struct bio	req;
@@ -294,6 +302,8 @@ struct r6_state {
 #define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
 #define	STRIPE_BIOFILL_RUN	14
 #define	STRIPE_COMPUTE_RUN	15
+#define	STRIPE_OPS_REQ_PENDING	16
+
 /*
  * Operation request flags
  */
-- 
cgit v1.2.2


From e4424fee1815f996dccd36be44d68ca160ec3e1b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 16 Oct 2009 16:27:34 +1100
Subject: md: fix problems with RAID6 calculations for DDF.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 20 +++++++++++++-------
 drivers/md/raid5.h |  2 +-
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 25c3c29134d1..d4ce51b4d41b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -158,11 +158,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 {
 	int slot;
 
+	if (sh->ddf_layout)
+		slot = (*count)++;
 	if (idx == sh->pd_idx)
 		return syndrome_disks;
 	if (idx == sh->qd_idx)
 		return syndrome_disks + 1;
-	slot = (*count)++;
+	if (!sh->ddf_layout)
+		slot = (*count)++;
 	return slot;
 }
 
@@ -727,9 +730,8 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 		srcs[slot] = sh->dev[i].page;
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
-	BUG_ON(count != syndrome_disks);
 
-	return count;
+	return syndrome_disks;
 }
 
 static struct dma_async_tx_descriptor *
@@ -828,7 +830,6 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			failb = slot;
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
-	BUG_ON(count != syndrome_disks);
 
 	BUG_ON(faila == failb);
 	if (failb < faila)
@@ -845,7 +846,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 					  ops_complete_compute, sh,
 					  to_addr_conv(sh, percpu));
-			return async_gen_syndrome(blocks, 0, count+2,
+			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
 						  STRIPE_SIZE, &submit);
 		} else {
 			struct page *dest;
@@ -1935,10 +1936,15 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 		case ALGORITHM_PARITY_N:
 			break;
 		case ALGORITHM_ROTATING_N_CONTINUE:
+			/* Like left_symmetric, but P is before Q */
 			if (sh->pd_idx == 0)
 				i--;	/* P D D D Q */
-			else if (i > sh->pd_idx)
-				i -= 2; /* D D Q P D */
+			else {
+				/* D D Q P D */
+				if (i < sh->pd_idx)
+					i += raid_disks;
+				i -= (sh->pd_idx + 1);
+			}
 			break;
 		case ALGORITHM_LEFT_ASYMMETRIC_6:
 		case ALGORITHM_RIGHT_ASYMMETRIC_6:
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index dcefdc9629ee..dd708359b451 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -488,7 +488,7 @@ static inline int algorithm_valid_raid6(int layout)
 {
 	return (layout >= 0 && layout <= 5)
 		||
-		(layout == 8 || layout == 10)
+		(layout >= 8 && layout <= 10)
 		||
 		(layout >= 16 && layout <= 20);
 }
-- 
cgit v1.2.2


From 5e5e3e78ed9038b8f7112835d07084eefb9daa47 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 16 Oct 2009 16:35:30 +1100
Subject: md: Fix handling of raid5 array which is being reshaped to fewer
 devices.

When a raid5 (or raid6) array is being reshaped to have fewer devices,
conf->raid_disks is the latter and hence smaller number of devices.
However sometimes we want to use a number which is the total number of
currently required devices - the larger of the 'old' and 'new' sizes.
Before we implemented reducing the number of devices, this was always
'new' i.e. ->raid_disks.
Now we need max(raid_disks, previous_raid_disks) in those places.

This particularly affects assembling an array that was shutdown while
in the middle of a reshape to fewer devices.

md.c needs a similar fix when interpreting the md metadata.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    |  2 +-
 drivers/md/raid5.c | 37 ++++++++++++++++++-------------------
 2 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 26ba42a79129..10eb1fce975e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2631,7 +2631,7 @@ static void analyze_sbs(mddev_t * mddev)
 			rdev->desc_nr = i++;
 			rdev->raid_disk = rdev->desc_nr;
 			set_bit(In_sync, &rdev->flags);
-		} else if (rdev->raid_disk >= mddev->raid_disks) {
+		} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
 			rdev->raid_disk = -1;
 			clear_bit(In_sync, &rdev->flags);
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d4ce51b4d41b..c4366c9373c5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1238,22 +1238,22 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
+	int disks = max(conf->raid_disks, conf->previous_raid_disks);
 	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
+	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
 	#ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 	#endif
 
-	if (grow_buffers(sh, conf->raid_disks)) {
-		shrink_buffers(sh, conf->raid_disks);
+	if (grow_buffers(sh, disks)) {
+		shrink_buffers(sh, disks);
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
-	sh->disks = conf->raid_disks;
 	/* we just created an active stripe so... */
 	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
@@ -1265,7 +1265,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
 static int grow_stripes(raid5_conf_t *conf, int num)
 {
 	struct kmem_cache *sc;
-	int devs = conf->raid_disks;
+	int devs = max(conf->raid_disks, conf->previous_raid_disks);
 
 	sprintf(conf->cache_name[0],
 		"raid%d-%s", conf->level, mdname(conf->mddev));
@@ -3540,9 +3540,10 @@ static void unplug_slaves(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev->private;
 	int i;
+	int devs = max(conf->raid_disks, conf->previous_raid_disks);
 
 	rcu_read_lock();
-	for (i = 0; i < conf->raid_disks; i++) {
+	for (i = 0; i < devs; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
 			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
@@ -4562,13 +4563,9 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 
 	if (!sectors)
 		sectors = mddev->dev_sectors;
-	if (!raid_disks) {
+	if (!raid_disks)
 		/* size is defined by the smallest of previous and new size */
-		if (conf->raid_disks < conf->previous_raid_disks)
-			raid_disks = conf->raid_disks;
-		else
-			raid_disks = conf->previous_raid_disks;
-	}
+		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
 
 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
 	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
@@ -4669,7 +4666,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
 			}
 			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
 		}
-		scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
+		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
 		if (!scribble) {
 			err = -ENOMEM;
 			break;
@@ -4690,7 +4687,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
 static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
-	int raid_disk, memory;
+	int raid_disk, memory, max_disks;
 	mdk_rdev_t *rdev;
 	struct disk_info *disk;
 
@@ -4740,13 +4737,14 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	conf->bypass_threshold = BYPASS_THRESHOLD;
 
 	conf->raid_disks = mddev->raid_disks;
-	conf->scribble_len = scribble_len(conf->raid_disks);
 	if (mddev->reshape_position == MaxSector)
 		conf->previous_raid_disks = mddev->raid_disks;
 	else
 		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
+	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
+	conf->scribble_len = scribble_len(max_disks);
 
-	conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
+	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
 	if (!conf->disks)
 		goto abort;
@@ -4764,7 +4762,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		raid_disk = rdev->raid_disk;
-		if (raid_disk >= conf->raid_disks
+		if (raid_disk >= max_disks
 		    || raid_disk < 0)
 			continue;
 		disk = conf->disks + raid_disk;
@@ -4796,7 +4794,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	}
 
 	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-		 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
+		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	if (grow_stripes(conf, conf->max_nr_stripes)) {
 		printk(KERN_ERR
 			"raid5: couldn't allocate %dkB for buffers\n", memory);
@@ -4921,7 +4919,8 @@ static int run(mddev_t *mddev)
 		    test_bit(In_sync, &rdev->flags))
 			working_disks++;
 
-	mddev->degraded = conf->raid_disks - working_disks;
+	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
+			   - working_disks);
 
 	if (mddev->degraded > conf->max_degraded) {
 		printk(KERN_ERR "raid5: not enough operational devices for %s"
-- 
cgit v1.2.2


From 5dd33c9a4c29015f6d87568d33521c98931a387e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 16 Oct 2009 16:40:25 +1100
Subject: md/async: don't pass a memory pointer as a page pointer.

md/raid6 passes a list of 'struct page *' to the async_tx routines,
which then either DMA map them for offload, or take the page_address
for CPU based calculations.

For RAID6 we sometime leave 'blanks' in the list of pages.
For CPU based calcs, we want to treat theses as a page of zeros.
For offloaded calculations, we simply don't pass a page to the
hardware.

Currently the 'blanks' are encoded as a pointer to
raid6_empty_zero_page.  This is a 4096 byte memory region, not a
'struct page'.  This is mostly handled correctly but is rather ugly.

So change the code to pass and expect a NULL pointer for the blanks.
When taking page_address of a page, we need to check for a NULL and
in that case use raid6_empty_zero_page.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c4366c9373c5..dcd9e659ed9d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -720,7 +720,7 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 	int i;
 
 	for (i = 0; i < disks; i++)
-		srcs[i] = (void *)raid6_empty_zero_page;
+		srcs[i] = NULL;
 
 	count = 0;
 	i = d0_idx;
@@ -816,7 +816,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 	 * slot number conversion for 'faila' and 'failb'
 	 */
 	for (i = 0; i < disks ; i++)
-		blocks[i] = (void *)raid6_empty_zero_page;
+		blocks[i] = NULL;
 	count = 0;
 	i = d0_idx;
 	do {
-- 
cgit v1.2.2


From 6d45d93ead319423099b82a4efd775bc0f159121 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 16 Oct 2009 23:18:14 +0100
Subject: dm snapshot: sort by chunk size to fix race

Avoid a race causing corruption when snapshots of the same origin have
different chunk sizes by sorting the internal list of snapshots by chunk
size, largest first.
  https://bugzilla.redhat.com/show_bug.cgi?id=182659

For example, let's have two snapshots with different chunk sizes. The
first snapshot (1) has small chunk size and the second snapshot (2) has
large chunk size.  Let's have chunks A, B, C in these snapshots:
snapshot1: ====A====   ====B====
snapshot2: ==========C==========

(Chunk size is a power of 2. Chunks are aligned.)

A write to the origin at a position within A and C comes along. It
triggers reallocation of A, then reallocation of C and links them
together using A as the 'primary' exception.

Then another write to the origin comes along at a position within B and
C.  It creates pending exception for B.  C already has a reallocation in
progress and it already has a primary exception (A), so nothing is done
to it: B and C are not linked.

If the reallocation of B finishes before the reallocation of C, because
there is no link with the pending exception for C it does not know to
wait for it and, the second write is dispatched to the origin and causes
data corruption in the chunk C in snapshot2.

To avoid this situation, we maintain snapshots sorted in descending
order of chunk size.  This leads to a guaranteed ordering on the links
between the pending exceptions and avoids the problem explained above -
both A and B now get linked to C.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 57f1bf7f3b7a..3a53a5a9bec8 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -296,6 +296,7 @@ static void __insert_origin(struct origin *o)
  */
 static int register_snapshot(struct dm_snapshot *snap)
 {
+	struct dm_snapshot *l;
 	struct origin *o, *new_o;
 	struct block_device *bdev = snap->origin->bdev;
 
@@ -319,7 +320,11 @@ static int register_snapshot(struct dm_snapshot *snap)
 		__insert_origin(o);
 	}
 
-	list_add_tail(&snap->list, &o->snapshots);
+	/* Sort the list according to chunk size, largest-first smallest-last */
+	list_for_each_entry(l, &o->snapshots, list)
+		if (l->store->chunk_size < snap->store->chunk_size)
+			break;
+	list_add_tail(&snap->list, &l->list);
 
 	up_write(&_origins_lock);
 	return 0;
-- 
cgit v1.2.2


From 034a186d29dbcef099e57ab23ec39440596be911 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Fri, 16 Oct 2009 23:18:14 +0100
Subject: dm snapshot: free exception store on init failure

While initializing the snapshot module, if we fail to register
the snapshot target then we must back-out the exception store
module initialization.

Cc: stable@kernel.org
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3a53a5a9bec8..53f4063f7ea4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1470,7 +1470,7 @@ static int __init dm_snapshot_init(void)
 	r = dm_register_target(&snapshot_target);
 	if (r) {
 		DMERR("snapshot target register failed %d", r);
-		return r;
+		goto bad_register_snapshot_target;
 	}
 
 	r = dm_register_target(&origin_target);
@@ -1527,6 +1527,9 @@ bad2:
 	dm_unregister_target(&origin_target);
 bad1:
 	dm_unregister_target(&snapshot_target);
+
+bad_register_snapshot_target:
+	dm_exception_store_exit();
 	return r;
 }
 
-- 
cgit v1.2.2


From bca915aae803cf01fde4461fc9c093cf5a86d7fc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 16 Oct 2009 23:18:15 +0100
Subject: dm log: userspace fix incorrect luid cast in userspace_ctr

mips:

drivers/md/dm-log-userspace-base.c: In function `userspace_ctr':
drivers/md/dm-log-userspace-base.c:159: warning: cast from pointer to integer of different size

Cc: stable@kernel.org
Cc: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 652bd33109e3..7ac2c1450d10 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -156,7 +156,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 	}
 
 	/* The ptr value is sufficient for local unique id */
-	lc->luid = (uint64_t)lc;
+	lc->luid = (unsigned long)lc;
 
 	lc->ti = ti;
 
-- 
cgit v1.2.2


From 03022c54b9725026c0370a810168975c387ad04c Mon Sep 17 00:00:00 2001
From: Zdenek Kabelac <zkabelac@redhat.com>
Date: Fri, 16 Oct 2009 23:18:15 +0100
Subject: dm: add missing del_gendisk to alloc_dev error path

Add missing del_gendisk() to error path when creation of workqueue fails.
Otherwice there is a resource leak and following warning is shown:

WARNING: at fs/sysfs/dir.c:487 sysfs_add_one+0xc5/0x160()
sysfs: cannot create duplicate filename '/devices/virtual/block/dm-0'

Cc: stable@kernel.org
Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 376f1ab48a24..c5f9918dab24 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1822,6 +1822,7 @@ static struct mapped_device *alloc_dev(int minor)
 bad_bdev:
 	destroy_workqueue(md->wq);
 bad_thread:
+	del_gendisk(md->disk);
 	put_disk(md->disk);
 bad_disk:
 	blk_cleanup_queue(md->queue);
-- 
cgit v1.2.2


From f88fb981183e71daf40bbd84bc8251bbf7b59e19 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Fri, 16 Oct 2009 23:18:15 +0100
Subject: dm: dec_pending needs locking to save error value

Multiple instances of dec_pending() can run concurrently so a lock is
needed when it saves the first error code.

I have never experienced actual problem without locking and just found
this during code inspection while implementing the barrier support
patch for request-based dm.

This patch adds the locking.
I've done compile, boot and basic I/O testings.

Cc: stable@kernel.org
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c5f9918dab24..724efc63904d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -47,6 +47,7 @@ struct dm_io {
 	atomic_t io_count;
 	struct bio *bio;
 	unsigned long start_time;
+	spinlock_t endio_lock;
 };
 
 /*
@@ -578,8 +579,12 @@ static void dec_pending(struct dm_io *io, int error)
 	struct mapped_device *md = io->md;
 
 	/* Push-back supersedes any I/O errors */
-	if (error && !(io->error > 0 && __noflush_suspending(md)))
-		io->error = error;
+	if (unlikely(error)) {
+		spin_lock_irqsave(&io->endio_lock, flags);
+		if (!(io->error > 0 && __noflush_suspending(md)))
+			io->error = error;
+		spin_unlock_irqrestore(&io->endio_lock, flags);
+	}
 
 	if (atomic_dec_and_test(&io->io_count)) {
 		if (io->error == DM_ENDIO_REQUEUE) {
@@ -1226,6 +1231,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	atomic_set(&ci.io->io_count, 1);
 	ci.io->bio = bio;
 	ci.io->md = md;
+	spin_lock_init(&ci.io->endio_lock);
 	ci.sector = bio->bi_sector;
 	ci.sector_count = bio_sectors(bio);
 	if (unlikely(bio_empty_barrier(bio)))
-- 
cgit v1.2.2


From 3f2412dc85260e5aae7ebb03bf50d5b1407e3083 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 16 Oct 2009 23:18:16 +0100
Subject: dm snapshot: require non zero chunk size by end of ctr

If we are creating snapshot with memory-stored exception store, fail if
the user didn't specify chunk size. Zero chunk size would probably crash
a lot of places in the rest of snapshot code.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 53f4063f7ea4..9cb392b3e920 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -673,6 +673,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	bio_list_init(&s->queued_bios);
 	INIT_WORK(&s->queued_bios_work, flush_queued_bios);
 
+	if (!s->store->chunk_size) {
+		ti->error = "Chunk size not set";
+		goto bad_load_and_register;
+	}
+
 	/* Add snapshot to the list of snapshots for this origin */
 	/* Exceptions aren't triggered till snapshot_resume() is called */
 	if (register_snapshot(s)) {
-- 
cgit v1.2.2


From 0e8c4e4e3ebb15756ddc4170a88149a2cd323cfe Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 16 Oct 2009 23:18:16 +0100
Subject: dm exception store: fix failed set_chunk_size error path

Properly close the device if failing because of an invalid chunk size.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 556acff3952f..e5de7627c52d 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -251,7 +251,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 
 	r = set_chunk_size(tmp_store, argv[2], &ti->error);
 	if (r)
-		goto bad_cow;
+		goto bad_ctr;
 
 	r = type->ctr(tmp_store, 0, NULL);
 	if (r) {
-- 
cgit v1.2.2


From 4c6fff445d7aa753957856278d4d93bcad6e2c14 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 16 Oct 2009 23:18:16 +0100
Subject: dm snapshot: lock snapshot while supplying status

This patch locks the snapshot when returning status.  It fixes a race
when it could return an invalid number of free chunks if someone
was simultaneously modifying it.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9cb392b3e920..9bc814aa2bbd 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1152,6 +1152,8 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 	unsigned sz = 0;
 	struct dm_snapshot *snap = ti->private;
 
+	down_write(&snap->lock);
+
 	switch (type) {
 	case STATUSTYPE_INFO:
 		if (!snap->valid)
@@ -1183,6 +1185,8 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 		break;
 	}
 
+	up_write(&snap->lock);
+
 	return 0;
 }
 
-- 
cgit v1.2.2


From df96eee679ba28c98cf722fa7c9f4286ee1ed0bd Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 16 Oct 2009 23:18:17 +0100
Subject: dm snapshot: use unsigned integer chunk size

Use unsigned integer chunk size.

Maximum chunk size is 512kB, there won't ever be need to use 4GB chunk size,
so the number can be 32-bit. This fixes compiler failure on 32-bit systems
with large block devices.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 20 +++++++++++---------
 drivers/md/dm-exception-store.h |  8 ++++----
 drivers/md/dm-snap-persistent.c | 16 ++++++++--------
 drivers/md/dm-snap.c            |  4 ++--
 4 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index e5de7627c52d..932d1b123143 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -155,7 +155,8 @@ static int set_chunk_size(struct dm_exception_store *store,
 	char *value;
 
 	chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
-	if (*chunk_size_arg == '\0' || *value != '\0') {
+	if (*chunk_size_arg == '\0' || *value != '\0' ||
+	    chunk_size_ulong > UINT_MAX) {
 		*error = "Invalid chunk size";
 		return -EINVAL;
 	}
@@ -171,34 +172,35 @@ static int set_chunk_size(struct dm_exception_store *store,
 	 */
 	chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
 
-	return dm_exception_store_set_chunk_size(store, chunk_size_ulong,
+	return dm_exception_store_set_chunk_size(store,
+						 (unsigned) chunk_size_ulong,
 						 error);
 }
 
 int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
-				      unsigned long chunk_size_ulong,
+				      unsigned chunk_size,
 				      char **error)
 {
 	/* Check chunk_size is a power of 2 */
-	if (!is_power_of_2(chunk_size_ulong)) {
+	if (!is_power_of_2(chunk_size)) {
 		*error = "Chunk size is not a power of 2";
 		return -EINVAL;
 	}
 
 	/* Validate the chunk size against the device block size */
-	if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
+	if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
 		*error = "Chunk size is not a multiple of device blocksize";
 		return -EINVAL;
 	}
 
-	if (chunk_size_ulong > INT_MAX >> SECTOR_SHIFT) {
+	if (chunk_size > INT_MAX >> SECTOR_SHIFT) {
 		*error = "Chunk size is too high";
 		return -EINVAL;
 	}
 
-	store->chunk_size = chunk_size_ulong;
-	store->chunk_mask = chunk_size_ulong - 1;
-	store->chunk_shift = ffs(chunk_size_ulong) - 1;
+	store->chunk_size = chunk_size;
+	store->chunk_mask = chunk_size - 1;
+	store->chunk_shift = ffs(chunk_size) - 1;
 
 	return 0;
 }
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 812c71872ba0..8a223a48802c 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -101,9 +101,9 @@ struct dm_exception_store {
 	struct dm_dev *cow;
 
 	/* Size of data blocks saved - must be a power of 2 */
-	chunk_t chunk_size;
-	chunk_t chunk_mask;
-	chunk_t chunk_shift;
+	unsigned chunk_size;
+	unsigned chunk_mask;
+	unsigned chunk_shift;
 
 	void *context;
 };
@@ -169,7 +169,7 @@ int dm_exception_store_type_register(struct dm_exception_store_type *type);
 int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
 
 int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
-				      unsigned long chunk_size_ulong,
+				      unsigned chunk_size,
 				      char **error);
 
 int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index d5b2e08750d5..0c746420c008 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -284,12 +284,13 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 {
 	int r;
 	struct disk_header *dh;
-	chunk_t chunk_size;
+	unsigned chunk_size;
 	int chunk_size_supplied = 1;
 	char *chunk_err;
 
 	/*
-	 * Use default chunk size (or hardsect_size, if larger) if none supplied
+	 * Use default chunk size (or logical_block_size, if larger)
+	 * if none supplied
 	 */
 	if (!ps->store->chunk_size) {
 		ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
@@ -334,10 +335,9 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 		return 0;
 
 	if (chunk_size_supplied)
-		DMWARN("chunk size %llu in device metadata overrides "
-		       "table chunk size of %llu.",
-		       (unsigned long long)chunk_size,
-		       (unsigned long long)ps->store->chunk_size);
+		DMWARN("chunk size %u in device metadata overrides "
+		       "table chunk size of %u.",
+		       chunk_size, ps->store->chunk_size);
 
 	/* We had a bogus chunk_size. Fix stuff up. */
 	free_area(ps);
@@ -345,8 +345,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
 					      &chunk_err);
 	if (r) {
-		DMERR("invalid on-disk chunk size %llu: %s.",
-		      (unsigned long long)chunk_size, chunk_err);
+		DMERR("invalid on-disk chunk size %u: %s.",
+		      chunk_size, chunk_err);
 		return r;
 	}
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9bc814aa2bbd..3a3ba46e6d4b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -961,7 +961,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 
 	src.bdev = bdev;
 	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
-	src.count = min(s->store->chunk_size, dev_size - src.sector);
+	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
 
 	dest.bdev = s->store->cow->bdev;
 	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
@@ -1402,7 +1402,7 @@ static void origin_resume(struct dm_target *ti)
 	struct dm_dev *dev = ti->private;
 	struct dm_snapshot *snap;
 	struct origin *o;
-	chunk_t chunk_size = 0;
+	unsigned chunk_size = 0;
 
 	down_read(&_origins_lock);
 	o = __lookup_origin(dev->bdev);
-- 
cgit v1.2.2


From c1cc65caa19bb8a1b2e371000ef2719581db1691 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 16 Oct 2009 23:18:22 +0100
Subject: dm snapshot: allow chunk size to be less than page size

Allow the snapshot chunk size to be smaller than the page size
The code is now capable of handling this due to some previous
fixes and enhancements.

As the page size varies between computers, prior to this patch,
the chunk size of a snapshot dictated which machines could read it:
Snapshots created on one machine might not be readable on another.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 932d1b123143..7dbe652efb5a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -138,16 +138,6 @@ int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
 }
 EXPORT_SYMBOL(dm_exception_store_type_unregister);
 
-/*
- * Round a number up to the nearest 'size' boundary.  size must
- * be a power of 2.
- */
-static ulong round_up(ulong n, ulong size)
-{
-	size--;
-	return (n + size) & ~size;
-}
-
 static int set_chunk_size(struct dm_exception_store *store,
 			  const char *chunk_size_arg, char **error)
 {
@@ -166,12 +156,6 @@ static int set_chunk_size(struct dm_exception_store *store,
 		return 0;
 	}
 
-	/*
-	 * Chunk size must be multiple of page size.  Silently
-	 * round up if it's not.
-	 */
-	chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
-
 	return dm_exception_store_set_chunk_size(store,
 						 (unsigned) chunk_size_ulong,
 						 error);
-- 
cgit v1.2.2


From 6629542e79255e0dbef8ec82eaf644e1b2546c3c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 19 Oct 2009 18:09:32 -0700
Subject: md/raid6: kill a gcc-4.0.1 'uninitialized variable' warning

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/raid5.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dcd9e659ed9d..81abefc172d9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -156,16 +156,16 @@ static inline int raid6_next_disk(int disk, int raid_disks)
 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 			     int *count, int syndrome_disks)
 {
-	int slot;
+	int slot = *count;
 
 	if (sh->ddf_layout)
-		slot = (*count)++;
+		(*count)++;
 	if (idx == sh->pd_idx)
 		return syndrome_disks;
 	if (idx == sh->qd_idx)
 		return syndrome_disks + 1;
 	if (!sh->ddf_layout)
-		slot = (*count)++;
+		(*count)++;
 	return slot;
 }
 
-- 
cgit v1.2.2


From 24395a85d8efe6eee477ea35c73d045a8dd7a3a1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Nov 2009 14:59:27 +1100
Subject: md: don't clear endpoint for resync when resync is interrupted.

If a 'sync_max' has been set (via sysfs), it is wrong to clear it
until a resync (or reshape or recovery ...) actually reached that
point.
So if a resync is interrupted (e.g. by device failure),
leave 'resync_max' unchanged.

This is particularly important for 'reshape' operations that do not
change the size of the array.  For such operations mdadm needs to
monitor the reshape taking rolling backups of the section being
reshaped.  If resync_max gets cleared, the reshape can get ahead of
mdadm and then the backups that mdadm creates are useless.

This is suitable for 2.6.31.y stable kernels.
Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 10eb1fce975e..e64c971038d1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6504,8 +6504,9 @@ void md_do_sync(mddev_t *mddev)
  skip:
 	mddev->curr_resync = 0;
 	mddev->curr_resync_completed = 0;
-	mddev->resync_min = 0;
-	mddev->resync_max = MaxSector;
+	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+		/* We completed so max setting can be forgotten. */
+		mddev->resync_max = MaxSector;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	wake_up(&resync_wait);
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
-- 
cgit v1.2.2


From 8dee7211467a56b7eb4e4359efb0aa4a72e1b6f3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Nov 2009 14:59:29 +1100
Subject: md/raid5: make sure curr_sync_completes is uptodate when reshape
 starts

This value is visible through sysfs and is used by mdadm
when it manages a reshape (backing up data that is about to be
rearranged).  So it is important that it is always correct.
Current it does not get updated properly when a reshape
starts which can cause problems when assembling an array
that is in the middle of being reshaped.

This is suitable for 2.6.31.y stable kernels.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 81abefc172d9..dcce204b6c73 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4049,6 +4049,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
 		if (sector_nr) {
+			mddev->curr_resync_completed = sector_nr;
+			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 			*skipped = 1;
 			return sector_nr;
 		}
-- 
cgit v1.2.2


From 5e8651060cea6b44844521ddcac665e2c021f5d8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 12 Nov 2009 12:08:04 +1100
Subject: md: factor out updating of 'recovery_offset'.

Each device has its own 'recovery_offset' showing how far
recovery has progressed on the device.
As the only real significance of this is that fact that it can
be stored in the metadata and recovered at restart, and as
only 1.x metadata can do this, we were only updating
'recovery_offset' to 'curr_resync_completed' when updating
v1.x metadata.
But this is wrong, and we will shortly make limited use of this
field in v0.90 metadata.

So move the update into common code.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e64c971038d1..01b9a0fd16e0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1382,8 +1382,6 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if (rdev->raid_disk >= 0 &&
 	    !test_bit(In_sync, &rdev->flags)) {
-		if (mddev->curr_resync_completed > rdev->recovery_offset)
-			rdev->recovery_offset = mddev->curr_resync_completed;
 		if (rdev->recovery_offset > 0) {
 			sb->feature_map |=
 				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
@@ -1917,6 +1915,14 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 	 */
 	mdk_rdev_t *rdev;
 
+	/* First make sure individual recovery_offsets are correct */
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(In_sync, &rdev->flags) &&
+		    mddev->curr_resync_completed > rdev->recovery_offset)
+				rdev->recovery_offset = mddev->curr_resync_completed;
+
+	}	
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->sb_events == mddev->events ||
 		    (nospares &&
-- 
cgit v1.2.2


From 0261cd9f1cb42fa44ece314d27868d83742bdf03 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 13 Nov 2009 17:40:48 +1100
Subject: md: allow v0.91 metadata to record devices as being active but not
 in-sync.

This is a combination that didn't really make sense before.
However when a reshape is converting e.g. raid5 -> raid6, the extra
device is not fully in-sync, but is certainly active and contains
important data.
So allow that start to be meaningful and in particular get
the 'recovery_offset' value (which is needed for any non-in-sync
active device) from the reshape_position.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 01b9a0fd16e0..b182f86a19dd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -944,6 +944,14 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			    desc->raid_disk < mddev->raid_disks */) {
 			set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = desc->raid_disk;
+		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
+			/* active but not in sync implies recovery up to
+			 * reshape position.  We don't know exactly where
+			 * that is, so set to zero for now */
+			if (mddev->minor_version >= 91) {
+				rdev->recovery_offset = 0;
+				rdev->raid_disk = desc->raid_disk;
+			}
 		}
 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
@@ -1032,8 +1040,19 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	list_for_each_entry(rdev2, &mddev->disks, same_set) {
 		mdp_disk_t *d;
 		int desc_nr;
-		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
-		    && !test_bit(Faulty, &rdev2->flags))
+		int is_active = test_bit(In_sync, &rdev2->flags);
+
+		if (rdev2->raid_disk >= 0 &&
+		    sb->minor_version >= 91)
+			/* we have nowhere to store the recovery_offset,
+			 * but if it is not below the reshape_position,
+			 * we can piggy-back on that.
+			 */
+			is_active = 1;
+		if (rdev2->raid_disk < 0 ||
+		    test_bit(Faulty, &rdev2->flags))
+			is_active = 0;
+		if (is_active)
 			desc_nr = rdev2->raid_disk;
 		else
 			desc_nr = next_spare++;
@@ -1043,16 +1062,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		d->number = rdev2->desc_nr;
 		d->major = MAJOR(rdev2->bdev->bd_dev);
 		d->minor = MINOR(rdev2->bdev->bd_dev);
-		if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
-		    && !test_bit(Faulty, &rdev2->flags))
+		if (is_active)
 			d->raid_disk = rdev2->raid_disk;
 		else
 			d->raid_disk = rdev2->desc_nr; /* compatibility */
 		if (test_bit(Faulty, &rdev2->flags))
 			d->state = (1<<MD_DISK_FAULTY);
-		else if (test_bit(In_sync, &rdev2->flags)) {
+		else if (is_active) {
 			d->state = (1<<MD_DISK_ACTIVE);
-			d->state |= (1<<MD_DISK_SYNC);
+			if (test_bit(In_sync, &rdev2->flags))
+				d->state |= (1<<MD_DISK_SYNC);
 			active++;
 			working++;
 		} else {
-- 
cgit v1.2.2


From 7ef90146a14c2bb1de2e22399f147ebec5b74f0b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 13 Nov 2009 17:40:51 +1100
Subject: Don't unconditionally set in_sync on newly added device in
 raid5_reshape

When a reshape finds that it can add spare devices into the array,
those devices might already be 'in_sync' if they are beyond the old
size of the array, or they might not if they are within the array.

The first case happens when we change an N-drive RAID5 to an
N+1-drive RAID5.
The second happens when we convert an N-drive RAID5 to an
N+1-drive RAID6.

So set the flag more carefully.
Also, ->recovery_offset is only meaningful when the flag is clear,
so only set it in that case.

This change needs the preceding two to ensure that the non-in_sync
device doesn't get evicted from the array when it is stopped, in the
case where v0.90 metadata is used.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dcce204b6c73..ab40529bdabe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5361,9 +5361,11 @@ static int raid5_start_reshape(mddev_t *mddev)
 		    !test_bit(Faulty, &rdev->flags)) {
 			if (raid5_add_disk(mddev, rdev) == 0) {
 				char nm[20];
-				set_bit(In_sync, &rdev->flags);
+				if (rdev->raid_disk >= conf->previous_raid_disks)
+					set_bit(In_sync, &rdev->flags);
+				else
+					rdev->recovery_offset = 0;
 				added_devices++;
-				rdev->recovery_offset = 0;
 				sprintf(nm, "rd%d", rdev->raid_disk);
 				if (sysfs_create_link(&mddev->kobj,
 						      &rdev->kobj, nm))
-- 
cgit v1.2.2


From c148ffdcda00b6599b70f8b65e6a1fadd1dbb127 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 13 Nov 2009 17:47:00 +1100
Subject: md/raid5: Allow dirty-degraded arrays to be assembled when only party
 is degraded.

Normally is it not safe to allow a raid5 that is both dirty and
degraded to be assembled without explicit request from that admin, as
it can cause hidden data corruption.
This is because 'dirty' means that the parity cannot be trusted, and
'degraded' means that the parity needs to be used.

However, if the device that is missing contains only parity, then
there is no issue and assembly can continue.
This particularly applies when a RAID5 is being converted to a RAID6
and there is an unclean shutdown while the conversion is happening.

So check for whether the degraded space only contains parity, and
in that case, allow the assembly.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ab40529bdabe..d29215d966da 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4823,11 +4823,40 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		return ERR_PTR(-ENOMEM);
 }
 
+
+static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
+{
+	switch (algo) {
+	case ALGORITHM_PARITY_0:
+		if (raid_disk < max_degraded)
+			return 1;
+		break;
+	case ALGORITHM_PARITY_N:
+		if (raid_disk >= raid_disks - max_degraded)
+			return 1;
+		break;
+	case ALGORITHM_PARITY_0_6:
+		if (raid_disk == 0 || 
+		    raid_disk == raid_disks - 1)
+			return 1;
+		break;
+	case ALGORITHM_LEFT_ASYMMETRIC_6:
+	case ALGORITHM_RIGHT_ASYMMETRIC_6:
+	case ALGORITHM_LEFT_SYMMETRIC_6:
+	case ALGORITHM_RIGHT_SYMMETRIC_6:
+		if (raid_disk == raid_disks - 1)
+			return 1;
+	}
+	return 0;
+}
+
 static int run(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
 	int working_disks = 0, chunk_size;
+	int dirty_parity_disks = 0;
 	mdk_rdev_t *rdev;
+	sector_t reshape_offset = 0;
 
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "raid5: %s is not clean"
@@ -4861,6 +4890,7 @@ static int run(mddev_t *mddev)
 			       "on a stripe boundary\n");
 			return -EINVAL;
 		}
+		reshape_offset = here_new * mddev->new_chunk_sectors;
 		/* here_new is the stripe we will write to */
 		here_old = mddev->reshape_position;
 		sector_div(here_old, mddev->chunk_sectors *
@@ -4916,10 +4946,51 @@ static int run(mddev_t *mddev)
 	/*
 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk >= 0 &&
-		    test_bit(In_sync, &rdev->flags))
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk < 0)
+			continue;
+		if (test_bit(In_sync, &rdev->flags))
 			working_disks++;
+		/* This disc is not fully in-sync.  However if it
+		 * just stored parity (beyond the recovery_offset),
+		 * when we don't need to be concerned about the
+		 * array being dirty.
+		 * When reshape goes 'backwards', we never have
+		 * partially completed devices, so we only need
+		 * to worry about reshape going forwards.
+		 */
+		/* Hack because v0.91 doesn't store recovery_offset properly. */
+		if (mddev->major_version == 0 &&
+		    mddev->minor_version > 90)
+			rdev->recovery_offset = reshape_offset;
+			
+		printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
+		       rdev->raid_disk, working_disks, conf->prev_algo,
+		       conf->previous_raid_disks, conf->max_degraded,
+		       conf->algorithm, conf->raid_disks, 
+		       only_parity(rdev->raid_disk,
+				   conf->prev_algo,
+				   conf->previous_raid_disks,
+				   conf->max_degraded),
+		       only_parity(rdev->raid_disk,
+				   conf->algorithm,
+				   conf->raid_disks,
+				   conf->max_degraded));
+		if (rdev->recovery_offset < reshape_offset) {
+			/* We need to check old and new layout */
+			if (!only_parity(rdev->raid_disk,
+					 conf->algorithm,
+					 conf->raid_disks,
+					 conf->max_degraded))
+				continue;
+		}
+		if (!only_parity(rdev->raid_disk,
+				 conf->prev_algo,
+				 conf->previous_raid_disks,
+				 conf->max_degraded))
+			continue;
+		dirty_parity_disks++;
+	}
 
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
@@ -4935,7 +5006,7 @@ static int run(mddev_t *mddev)
 	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	if (mddev->degraded > 0 &&
+	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
 			printk(KERN_WARNING
-- 
cgit v1.2.2