From 0232605d987d8230b254aa139805bbb56a7ca30c Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 3 Jul 2012 15:56:52 +1000
Subject: md: make 'name' arg to md_register_thread non-optional.

Having the 'name' arg optional and defaulting to the current personality
name is not necessary and leads to errors, as when changing the level of
an array we can end up using the name of the old level instead of the new
one.

So make it non-optional and always explicitly pass the name of the level
that the array will be.

Reported-by: majianpeng
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a9c7981ddd24..39b2a8aa3b23 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2621,7 +2621,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                 goto abort;
         }
         err = -ENOMEM;
-        conf->thread = md_register_thread(raid1d, mddev, NULL);
+        conf->thread = md_register_thread(raid1d, mddev, "raid1");
         if (!conf->thread) {
                 printk(KERN_ERR
                        "md/raid1:%s: couldn't allocate thread\n",
--
cgit v1.2.2

From 32644afd8975d19174bcb9ba34687c32dd810a09 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 3 Jul 2012 15:58:42 +1000
Subject: md/raid1: fix bug in read_balance introduced by hot-replace

When we added hot_replace we doubled the number of devices that could be
in a RAID1 array.  So we doubled how far read_balance would search.
Unfortunately we didn't double the point at which it looped back to the
beginning - so it effectively loops over all non-replacement disks twice.

This doesn't cause bad behaviour, but it is pointless and means we never
read from replacement devices.

Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 39b2a8aa3b23..34b4665cb0b6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -517,8 +517,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 int bad_sectors;
 
                 int disk = start_disk + i;
-                if (disk >= conf->raid_disks)
-                        disk -= conf->raid_disks;
+                if (disk >= conf->raid_disks * 2)
+                        disk -= conf->raid_disks * 2;
 
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
--
cgit v1.2.2

From b357f04a67c2aeee828b240863cd3f21d6cb3179 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 3 Jul 2012 17:45:31 +1000
Subject: md: fix up plugging (again).

The value returned by "mddev_check_plug" is only valid until the next
'schedule', as that will unplug things.  This could happen at any call to
mempool_alloc.

So just calling mddev_check_plug at the start doesn't really make sense.
So call it just before, or just after, queuing things for the thread.
As the action that happens at unplug is to wake the thread, this makes
lots of sense.

If we cannot add a plug (which requires a small GFP_ATOMIC alloc) we
wake the thread immediately.

RAID5 is a bit different.  Requests are queued for the thread and the
thread is woken by release_stripe.  So we don't need to wake the thread
on failure.  However the thread doesn't perform certain actions when
there is any active plug, so it is important to install a plug before
waking the thread.  So for RAID5 we install the plug *before* queuing
the request and waking the thread.

Without this patch it is possible for raid1 or raid10 to queue a request
without then waking the thread, resulting in the array locking up.
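As an illustrative aside (not part of the commit message or the patch), the
hazard described above looks roughly like this; the snippet is a condensed
sketch of the old make_request() write path, not a verbatim quote of it:

    /*
     * Sketch only: the result of an early plug check can go stale,
     * because any sleeping allocation in between may schedule, and a
     * schedule unplugs.
     */
    plugged = mddev_check_plugged(mddev);           /* may be true now...  */
    mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);   /* ...but this may sleep */
    /* ... queue mbio on conf->pending_bio_list ... */
    if (do_sync || !bitmap || !plugged)             /* ...so stale here    */
            md_wakeup_thread(mddev->thread);

The change below therefore drops the early check and instead calls
mddev_check_plugged() at the moment each request is queued, waking the
thread whenever no plug could be attached.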
Also change raid10 to only flush_pending_write when there are no active
plugs, just like raid1.

This patch is suitable for 3.0 or later.  I plan to submit it to -stable,
but I'd like to let it spend a few weeks in mainline first to be sure it
is completely safe.

Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 34b4665cb0b6..8c2754f835ef 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
         const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
         struct md_rdev *blocked_rdev;
-        int plugged;
         int first_clone;
         int sectors_handled;
         int max_sectors;
@@ -1034,7 +1033,6 @@ read_again:
          * the bad blocks. Each set of writes gets it's own r1bio
          * with a set of bios attached.
          */
-        plugged = mddev_check_plugged(mddev);
 
         disks = conf->raid_disks * 2;
  retry_write:
@@ -1191,6 +1189,8 @@ read_again:
                 bio_list_add(&conf->pending_bio_list, mbio);
                 conf->pending_count++;
                 spin_unlock_irqrestore(&conf->device_lock, flags);
+                if (!mddev_check_plugged(mddev))
+                        md_wakeup_thread(mddev->thread);
         }
         /* Mustn't call r1_bio_write_done before this next test,
          * as it could result in the bio being freed.
@@ -1213,9 +1213,6 @@ read_again:
 
         /* In case raid1d snuck in to freeze_array */
         wake_up(&conf->wait_barrier);
-
-        if (do_sync || !bitmap || !plugged)
-                md_wakeup_thread(mddev->thread);
 }
 
 static void status(struct seq_file *seq, struct mddev *mddev)
--
cgit v1.2.2

From 2d4f4f3384d4ef4f7c571448e803a1ce721113d5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 9 Jul 2012 11:34:13 +1000
Subject: md/raid1: fix use-after-free bug in RAID1 data-check code.

This bug has been present ever since data-check was introduced in 2.6.16.
However it would only fire if a data-check were done on a degraded array,
which is only possible if the array has 3 or more devices.  This is
certainly possible, but is quite uncommon.

Since hot-replace was added in 3.3 it can happen more often as the same
condition can arise if not all possible replacements are present.

The problem is that as soon as we submit the last read request, the
'r1_bio' structure could be freed at any time, so we really should stop
looking at it.  If the last device is being read from we will stop
looking at it.  However if the last device is not due to be read from,
we will still check the bio pointer in the r1_bio, but the r1_bio might
already be free.

So use the read_targets counter to make sure we stop looking for bios
to submit as soon as we have submitted them all.

This fix is suitable for any -stable kernel since 2.6.16.
Cc: stable@vger.kernel.org
Reported-by: Arnold Schulz
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8c2754f835ef..240ff3125040 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2485,9 +2485,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
          */
         if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                 atomic_set(&r1_bio->remaining, read_targets);
-                for (i = 0; i < conf->raid_disks * 2; i++) {
+                for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
                         bio = r1_bio->bios[i];
                         if (bio->bi_end_io == end_sync_read) {
+                                read_targets--;
                                 md_sync_acct(bio->bi_bdev, nr_sectors);
                                 generic_make_request(bio);
                         }
--
cgit v1.2.2

From 58e94ae18478c08229626daece2fc108a4a23261 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 19 Jul 2012 15:59:18 +1000
Subject: md/raid1: close some possible races on write errors during resync

commit 4367af556133723d0f443e14ca8170d9447317cb
   md/raid1: clear bad-block record when write succeeds.

added a 'reschedule_retry' call possibility at the end of end_sync_write,
but didn't add matching code at the end of sync_request_write.  So if the
writes complete very quickly, or scheduling makes it seem that way, then
we can miss rescheduling the request and the resync could hang.

Also commit 73d5c38a9536142e062c35997b044e89166e063b
   md: avoid races when stopping resync.

fixed a race condition in this same code in end_sync_write but didn't
make the change in sync_request_write.

This patch updates sync_request_write to fix both of those.

Patch is suitable for 3.1 and later kernels.

Reported-by: Alexander Lyakas
Original-version-by: Alexander Lyakas
Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 240ff3125040..cacd008d6864 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1818,8 +1818,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 
         if (atomic_dec_and_test(&r1_bio->remaining)) {
                 /* if we're here, all write(s) have completed, so clean up */
-                md_done_sync(mddev, r1_bio->sectors, 1);
-                put_buf(r1_bio);
+                int s = r1_bio->sectors;
+                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+                    test_bit(R1BIO_WriteError, &r1_bio->state))
+                        reschedule_retry(r1_bio);
+                else {
+                        put_buf(r1_bio);
+                        md_done_sync(mddev, s, 1);
+                }
         }
 }
 
--
cgit v1.2.2

From 0eaf822cb3dfcf2a64b2d27f4f6219186adb2695 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow
Date: Tue, 31 Jul 2012 10:03:52 +1000
Subject: MD RAID1: rename mirror_info structure

MD RAID1: Rename the structure 'mirror_info' to 'raid1_info'

The same structure name ('mirror_info') is used by raid10.  Each of
these structures is defined in its respective header file.  If dm-raid
is to support both RAID1 and RAID10, the header files will be included
and the structure names must not collide.  While only one of these
structure names needs to change, this patch adds consistency to the
naming of the structure.
Signed-off-by: Jonathan Brassow
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..57d6abd497ef 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -873,7 +873,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
         struct r1conf *conf = mddev->private;
-        struct mirror_info *mirror;
+        struct raid1_info *mirror;
         struct r1bio *r1_bio;
         struct bio *read_bio;
         int i, disks;
@@ -1364,7 +1364,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
         struct r1conf *conf = mddev->private;
         int err = -EEXIST;
         int mirror = 0;
-        struct mirror_info *p;
+        struct raid1_info *p;
         int first = 0;
         int last = conf->raid_disks - 1;
         struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1433,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
         struct r1conf *conf = mddev->private;
         int err = 0;
         int number = rdev->raid_disk;
-        struct mirror_info *p = conf->mirrors+ number;
+        struct raid1_info *p = conf->mirrors + number;
 
         if (rdev != p->rdev)
                 p = conf->mirrors + conf->raid_disks + number;
@@ -2521,7 +2521,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
         struct r1conf *conf;
         int i;
-        struct mirror_info *disk;
+        struct raid1_info *disk;
         struct md_rdev *rdev;
         int err = -ENOMEM;
 
@@ -2529,7 +2529,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
         if (!conf)
                 goto abort;
 
-        conf->mirrors = kzalloc(sizeof(struct mirror_info)
+        conf->mirrors = kzalloc(sizeof(struct raid1_info)
                                 * mddev->raid_disks * 2,
                                  GFP_KERNEL);
         if (!conf->mirrors)
@@ -2798,7 +2798,7 @@ static int raid1_reshape(struct mddev *mddev)
          */
         mempool_t *newpool, *oldpool;
         struct pool_info *newpoolinfo;
-        struct mirror_info *newmirrors;
+        struct raid1_info *newmirrors;
         struct r1conf *conf = mddev->private;
         int cnt, raid_disks;
         unsigned long flags;
@@ -2841,7 +2841,7 @@ static int raid1_reshape(struct mddev *mddev)
                 kfree(newpoolinfo);
                 return -ENOMEM;
         }
-        newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+        newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
                              GFP_KERNEL);
         if (!newmirrors) {
                 kfree(newpoolinfo);
--
cgit v1.2.2

From 473e87ce485ffcac041f7911b33f0b4cd4d6cf2b Mon Sep 17 00:00:00 2001
From: Jonathan Brassow
Date: Tue, 31 Jul 2012 10:03:52 +1000
Subject: MD: Move macros from raid1*.h to raid1*.c

MD RAID1/RAID10: Move some macros from .h file to .c file

There are three macros (IO_BLOCKED, IO_MADE_GOOD, BIO_SPECIAL) which are
defined in both raid1.h and raid10.h.  They are only used in their
respective .c files.  However, if we wish to make RAID10 accessible to
the device-mapper RAID target (dm-raid.c), then we need to move these
macros into the .c files where they are used so that they do not
conflict with each other.

The macros from the two files are identical and could be moved into
md.h, but I chose to leave the duplication and have them remain in the
personality files.
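As an illustrative aside (not part of this patch): the sentinels are tiny
integer values cast to pointers, so cleanup paths must recognise and skip
them rather than treat them as real bios.  The sketch below is modelled on
raid1's existing put_all_bios() helper, with comments added here; details
of the real function may differ slightly:

    /* Sketch only: release real bios, but skip the IO_BLOCKED and
     * IO_MADE_GOOD sentinels, which are not valid bio pointers. */
    static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
    {
            int i;

            for (i = 0; i < conf->raid_disks * 2; i++) {
                    struct bio **bio = r1_bio->bios + i;

                    if (!BIO_SPECIAL(*bio))
                            bio_put(*bio);
                    *bio = NULL;
            }
    }

This is exactly the kind of check that BIO_SPECIAL() exists to keep in one
place.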
Signed-off-by: Jonathan Brassow
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 57d6abd497ef..d3d3568b4fb1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
--
cgit v1.2.2

From be4d3280b17bc51f23ec6ebb345728f302f80a0c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Jul 2012 10:03:53 +1000
Subject: md/raid1: make sequential read detection per disk based

Currently the sequential read detection is global (array-wide).  It's
natural to make it per disk based, which can improve the detection for
concurrent multiple sequential reads.  And the next patch will make SSD
read balancing not use a distance based algorithm, where this change
helps detect truly sequential reads for SSDs.

Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d3d3568b4fb1..fb96c0c2db40 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -497,9 +497,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         const sector_t this_sector = r1_bio->sector;
         int sectors;
         int best_good_sectors;
-        int start_disk;
         int best_disk;
-        int i;
+        int disk;
         sector_t best_dist;
         struct md_rdev *rdev;
         int choose_first;
@@ -517,23 +516,16 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         best_good_sectors = 0;
 
         if (conf->mddev->recovery_cp < MaxSector &&
-            (this_sector + sectors >= conf->next_resync)) {
+            (this_sector + sectors >= conf->next_resync))
                 choose_first = 1;
-                start_disk = 0;
-        } else {
+        else
                 choose_first = 0;
-                start_disk = conf->last_used;
-        }
 
-        for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+        for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
                 sector_t dist;
                 sector_t first_bad;
                 int bad_sectors;
 
-                int disk = start_disk + i;
-                if (disk >= conf->raid_disks * 2)
-                        disk -= conf->raid_disks * 2;
-
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
                     || rdev == NULL
@@ -594,7 +586,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 dist = abs(this_sector - conf->mirrors[disk].head_position);
                 if (choose_first
                     /* Don't change to another disk for sequential reads */
-                    || conf->next_seq_sect == this_sector
+                    || conf->mirrors[disk].next_seq_sect == this_sector
                     || dist == 0
                     /* If device is idle, use it */
                     || atomic_read(&rdev->nr_pending) == 0) {
@@ -620,8 +612,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                         goto retry;
                 }
                 sectors = best_good_sectors;
-                conf->next_seq_sect = this_sector + sectors;
-                conf->last_used = best_disk;
+                conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
         }
         rcu_read_unlock();
         *max_sectors = sectors;
@@ -2599,7 +2590,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
         conf->recovery_disabled = mddev->recovery_disabled - 1;
         err = -EIO;
-        conf->last_used = -1;
         for (i = 0; i < conf->raid_disks * 2; i++) {
 
                 disk = conf->mirrors + i;
@@ -2625,19 +2615,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                         if (disk->rdev &&
                             (disk->rdev->saved_raid_disk < 0))
                                 conf->fullsync = 1;
-                } else if (conf->last_used < 0)
-                        /*
-                         * The first working device is used as a
-                         * starting point to read balancing.
-                         */
-                        conf->last_used = i;
+                }
         }
 
-        if (conf->last_used < 0) {
-                printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-                       mdname(mddev));
-                goto abort;
-        }
         err = -ENOMEM;
         conf->thread = md_register_thread(raid1d, mddev, "raid1");
         if (!conf->thread) {
@@ -2894,7 +2874,6 @@ static int raid1_reshape(struct mddev *mddev)
         conf->raid_disks = mddev->raid_disks = raid_disks;
         mddev->delta_disks = 0;
 
-        conf->last_used = 0; /* just make sure it is in-range */
         lower_barrier(conf);
         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 
--
cgit v1.2.2

From 9dedf60313fa4dddfd5b9b226a0ef12a512bf9dc Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Jul 2012 10:03:53 +1000
Subject: md/raid1: read balance chooses idlest disk for SSD

An SSD has no spindle, so the distance between requests means nothing,
and the original distance based algorithm can sometimes cause severe
performance issues for SSD raid.

Consider two thread groups: one accesses file A, the other accesses
file B.  The first group will access one disk and the second will
access the other disk, because requests from one group are close
together while requests from different groups are far apart.  In this
case, read balancing might keep one disk very busy while the other is
relatively idle.  For SSDs, we should try our best to distribute
requests to as many disks as possible.  There is no spindle move
penalty anyway.

With the patch below, I can sometimes see more than 50% throughput
improvement, depending on the workload.

The only exception is when small requests can be merged into a big
request, which typically drives higher throughput for SSDs too.  Such
small requests are sequential reads.  Unlike with hard disks,
sequential reads which can't be merged (for example direct IO, or reads
without readahead) can be ignored for SSDs.  Again, there is no spindle
move penalty.  Readahead dispatches small requests and such requests
can be merged.

The last patch helps detect sequential reads well, at least if the
number of concurrent reads isn't greater than the number of raid disks.
In that case, the distance based algorithm doesn't work well either.

V2: For a mixed hard disk and SSD raid, don't use the distance based
algorithm for random IO either.  This makes the algorithm generic for
raids containing SSDs.
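To make the selection rule above concrete, here is a minimal,
self-contained sketch of the policy (illustrative only: the function and
variable names are invented, and the real read_balance() also handles
sequential reads, bad blocks and the "disk is idle right now" shortcut):

    #include <limits.h>

    /* Sketch only: pick the least-loaded disk when any SSD is present,
     * otherwise fall back to the classic shortest-distance choice. */
    static int choose_read_disk(int ndisks, const unsigned int *pending,
                                const unsigned long *dist, int has_nonrot_disk)
    {
            unsigned int min_pending = UINT_MAX;
            unsigned long best_dist = ULONG_MAX;
            int best_pending_disk = -1, best_dist_disk = -1;
            int i;

            for (i = 0; i < ndisks; i++) {
                    if (pending[i] < min_pending) {
                            min_pending = pending[i];
                            best_pending_disk = i;
                    }
                    if (dist[i] < best_dist) {
                            best_dist = dist[i];
                            best_dist_disk = i;
                    }
            }

            /* Any non-rotational disk present: balance by queue depth,
             * not by head distance. */
            return has_nonrot_disk ? best_pending_disk : best_dist_disk;
    }

The actual change in the diff below keeps both candidates (best_pending_disk
and best_dist_disk) and only decides between them once the scan is complete.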
Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb96c0c2db40..d9869f25aa75 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -497,9 +497,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         const sector_t this_sector = r1_bio->sector;
         int sectors;
         int best_good_sectors;
-        int best_disk;
+        int best_disk, best_dist_disk, best_pending_disk;
+        int has_nonrot_disk;
         int disk;
         sector_t best_dist;
+        unsigned int min_pending;
         struct md_rdev *rdev;
         int choose_first;
 
@@ -512,8 +514,12 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
         sectors = r1_bio->sectors;
         best_disk = -1;
+        best_dist_disk = -1;
         best_dist = MaxSector;
+        best_pending_disk = -1;
+        min_pending = UINT_MAX;
         best_good_sectors = 0;
+        has_nonrot_disk = 0;
 
         if (conf->mddev->recovery_cp < MaxSector &&
             (this_sector + sectors >= conf->next_resync))
@@ -525,6 +531,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 sector_t dist;
                 sector_t first_bad;
                 int bad_sectors;
+                unsigned int pending;
 
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
@@ -583,22 +590,43 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 } else
                         best_good_sectors = sectors;
 
+                has_nonrot_disk |= blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+                pending = atomic_read(&rdev->nr_pending);
                 dist = abs(this_sector - conf->mirrors[disk].head_position);
                 if (choose_first
                     /* Don't change to another disk for sequential reads */
                     || conf->mirrors[disk].next_seq_sect == this_sector
                     || dist == 0
                     /* If device is idle, use it */
-                    || atomic_read(&rdev->nr_pending) == 0) {
+                    || pending == 0) {
                         best_disk = disk;
                         break;
                 }
+
+                if (min_pending > pending) {
+                        min_pending = pending;
+                        best_pending_disk = disk;
+                }
+
                 if (dist < best_dist) {
                         best_dist = dist;
-                        best_disk = disk;
+                        best_dist_disk = disk;
                 }
         }
 
+        /*
+         * If all disks are rotational, choose the closest disk. If any disk is
+         * non-rotational, choose the disk with less pending request even the
+         * disk is rotational, which might/might not be optimal for raids with
+         * mixed ratation/non-rotational disks depending on workload.
+         */
+        if (best_disk == -1) {
+                if (has_nonrot_disk)
+                        best_disk = best_pending_disk;
+                else
+                        best_disk = best_dist_disk;
+        }
+
         if (best_disk >= 0) {
                 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
                 if (!rdev)
--
cgit v1.2.2

From 12cee5a8a29e7263e39953f1d941f723c617ca5f Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Jul 2012 10:03:53 +1000
Subject: md/raid1: prevent merging too large request

For SSDs, once the request size exceeds a specific value (the optimal IO
size), request size isn't important for bandwidth.  In that situation, if
making the request size bigger causes some disks to go idle, the total
throughput will actually drop.  A good example is doing a readahead in a
two-disk raid1 setup.

So when should we split big requests?  We absolutely don't want to split
big requests into very small requests.  Even on an SSD, big request
transfers are more efficient.  This patch only considers requests with a
size above the optimal IO size.

If all disks are busy, is it worth doing a split?  Say the optimal IO size
is 16k, there are two 32k requests and two disks.  We can let each disk
run one 32k request, or split the requests into four 16k requests so that
each disk runs two.
It's hard to say which case is better; it depends on the hardware.

So we only consider the case where there are idle disks.  For readahead,
splitting is always better in this case.  And in my test, the patch below
can improve throughput by more than 30%.  Hmm, not 100%, because the disks
aren't 100% busy.

Such a case can happen not just with readahead; for example, it can happen
with direct IO.  But I suppose direct IO usually has a bigger IO depth and
makes all disks busy, so I ignored it.

Note: if the raid uses any hard disk, we don't prevent merging.  That would
make performance worse.

Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d9869f25aa75..7aa958ed2847 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -504,6 +504,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         unsigned int min_pending;
         struct md_rdev *rdev;
         int choose_first;
+        int choose_next_idle;
 
         rcu_read_lock();
         /*
@@ -520,6 +521,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         min_pending = UINT_MAX;
         best_good_sectors = 0;
         has_nonrot_disk = 0;
+        choose_next_idle = 0;
 
         if (conf->mddev->recovery_cp < MaxSector &&
             (this_sector + sectors >= conf->next_resync))
@@ -532,6 +534,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 sector_t first_bad;
                 int bad_sectors;
                 unsigned int pending;
+                bool nonrot;
 
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
@@ -590,18 +593,52 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 } else
                         best_good_sectors = sectors;
 
-                has_nonrot_disk |= blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+                nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+                has_nonrot_disk |= nonrot;
                 pending = atomic_read(&rdev->nr_pending);
                 dist = abs(this_sector - conf->mirrors[disk].head_position);
-                if (choose_first
-                    /* Don't change to another disk for sequential reads */
-                    || conf->mirrors[disk].next_seq_sect == this_sector
-                    || dist == 0
-                    /* If device is idle, use it */
-                    || pending == 0) {
+                if (choose_first) {
                         best_disk = disk;
                         break;
                 }
+                /* Don't change to another disk for sequential reads */
+                if (conf->mirrors[disk].next_seq_sect == this_sector
+                    || dist == 0) {
+                        int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+                        struct raid1_info *mirror = &conf->mirrors[disk];
+
+                        best_disk = disk;
+                        /*
+                         * If buffered sequential IO size exceeds optimal
+                         * iosize, check if there is idle disk. If yes, choose
+                         * the idle disk. read_balance could already choose an
+                         * idle disk before noticing it's a sequential IO in
+                         * this disk. This doesn't matter because this disk
+                         * will idle, next time it will be utilized after the
+                         * first disk has IO size exceeds optimal iosize. In
+                         * this way, iosize of the first disk will be optimal
+                         * iosize at least. iosize of the second disk might be
+                         * small, but not a big deal since when the second disk
+                         * starts IO, the first disk is likely still busy.
+                         */
+                        if (nonrot && opt_iosize > 0 &&
+                            mirror->seq_start != MaxSector &&
+                            mirror->next_seq_sect > opt_iosize &&
+                            mirror->next_seq_sect - opt_iosize >=
+                            mirror->seq_start) {
+                                choose_next_idle = 1;
+                                continue;
+                        }
+                        break;
+                }
+                /* If device is idle, use it */
+                if (pending == 0) {
+                        best_disk = disk;
+                        break;
+                }
+
+                if (choose_next_idle)
+                        continue;
 
                 if (min_pending > pending) {
                         min_pending = pending;
                         best_pending_disk = disk;
@@ -640,6 +677,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                         goto retry;
                 }
                 sectors = best_good_sectors;
+
+                if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+                        conf->mirrors[best_disk].seq_start = this_sector;
+
                 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
         }
         rcu_read_unlock();
         *max_sectors = sectors;
@@ -2605,6 +2646,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                         mddev->merge_check_needed = 1;
 
                 disk->head_position = 0;
+                disk->seq_start = MaxSector;
         }
         conf->raid_disks = mddev->raid_disks;
         conf->mddev = mddev;
--
cgit v1.2.2

From b7219ccb33aa0df9949a60c68b5e9f712615e56f Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Jul 2012 10:05:34 +1000
Subject: md/raid1: don't abort a resync on the first badblock.

If a resync of a RAID1 array with 2 devices finds a known bad block on one
device, it will neither read from, nor write to, that device for this
block offset.  So there will be one read_target (the other device) and
zero write targets.

This condition causes md/raid1 to abort the resync assuming that it has
finished - without known bad blocks this would be true.

When there are no write targets because of the presence of bad blocks, we
should only skip over the area covered by the bad block.  RAID10 already
gets this right, raid1 doesn't.  Or didn't.

As this can cause a 'sync' to abort early and appear to have succeeded, it
could lead to some data corruption, so it is suitable for -stable.

Cc: stable@vger.kernel.org
Reported-by: Alexander Lyakas
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7aa958ed2847..d2361b162de5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2502,7 +2502,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                         /* There is nowhere to write, so all non-sync
                          * drives must be failed - so we are finished
                          */
-                        sector_t rv = max_sector - sector_nr;
+                        sector_t rv;
+                        if (min_bad > 0)
+                                max_sector = sector_nr + min_bad;
+                        rv = max_sector - sector_nr;
                         *skipped = 1;
                         put_buf(r1_bio);
                         return rv;
--
cgit v1.2.2

From d57368afe63b3b7b45ce6c2b8c5276417935be2f Mon Sep 17 00:00:00 2001
From: Alexander Lyakas
Date: Tue, 17 Jul 2012 13:17:55 +0300
Subject: md/RAID1: Add missing case for attempting to repair known bad blocks.

When doing resync or repair, attempt to correct bad blocks, according to
the WriteErrorSeen policy.

Signed-off-by: Alex Lyakas
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d2361b162de5..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2445,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                         bio->bi_rw = READ;
                         bio->bi_end_io = end_sync_read;
                         read_targets++;
+                } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+                        test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+                        !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+                        /*
+                         * The device is suitable for reading (InSync),
+                         * but has bad block(s) here. Let's try to correct them,
+                         * if we are doing resync or repair. Otherwise, leave
+                         * this device alone for this sync request.
+                         */
+                        bio->bi_rw = WRITE;
+                        bio->bi_end_io = end_sync_write;
+                        write_targets++;
                 }
         }
         if (bio->bi_end_io) {
--
cgit v1.2.2

From 0021b7bc045e4b0b85d8c53614342aaf84ca96a5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Jul 2012 09:08:14 +0200
Subject: md: remove plug_cnt feature of plugging.

This seemed like a good idea at the time, but after further thought I
cannot see it making a difference other than very occasionally, and
testing to try to exercise the case it is most likely to help did not
show any performance difference by removing it.

So remove the counting of active plugs and allow 'pending writes' to be
activated at any time, not just when no plugs are active.

This is only relevant when there is a write-intent bitmap, and the
updating of the bitmap will likely introduce enough delay that the
single-threading of bitmap updates will be enough to collect large
numbers of updates together.

Removing this will make it easier to centralise the unplug code, and will
clear the way for other unplug enhancements which have a measurable
effect.

Signed-off-by: NeilBrown
Signed-off-by: Jens Axboe
---
 drivers/md/raid1.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..36a8fc059ac3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2173,8 +2173,7 @@ static void raid1d(struct mddev *mddev)
         blk_start_plug(&plug);
         for (;;) {
 
-                if (atomic_read(&mddev->plug_cnt) == 0)
-                        flush_pending_writes(conf);
+                flush_pending_writes(conf);
 
                 spin_lock_irqsave(&conf->device_lock, flags);
                 if (list_empty(head)) {
--
cgit v1.2.2

From f54a9d0e59c4bea3db733921ca9147612a6f292c Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 2 Aug 2012 08:33:20 +1000
Subject: md/raid1: submit IO from originating thread instead of md thread.

Queuing writes to the md thread means that all requests go through the one
processor, which may not be able to keep up with very high request rates.

So use the plugging infrastructure to submit all requests on unplug.  If a
'schedule' is needed, we fall back on the old approach of handing the
requests to the thread for it to handle.
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36a8fc059ac3..9f01870d031c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -870,6 +870,44 @@ do_sync_io:
         pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
+struct raid1_plug_cb {
+        struct blk_plug_cb      cb;
+        struct bio_list         pending;
+        int                     pending_cnt;
+};
+
+static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+        struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
+                                                  cb);
+        struct mddev *mddev = plug->cb.data;
+        struct r1conf *conf = mddev->private;
+        struct bio *bio;
+
+        if (from_schedule) {
+                spin_lock_irq(&conf->device_lock);
+                bio_list_merge(&conf->pending_bio_list, &plug->pending);
+                conf->pending_count += plug->pending_cnt;
+                spin_unlock_irq(&conf->device_lock);
+                md_wakeup_thread(mddev->thread);
+                kfree(plug);
+                return;
+        }
+
+        /* we aren't scheduling, so we can do the write-out directly. */
+        bio = bio_list_get(&plug->pending);
+        bitmap_unplug(mddev->bitmap);
+        wake_up(&conf->wait_barrier);
+
+        while (bio) { /* submit pending writes */
+                struct bio *next = bio->bi_next;
+                bio->bi_next = NULL;
+                generic_make_request(bio);
+                bio = next;
+        }
+        kfree(plug);
+}
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
         struct r1conf *conf = mddev->private;
@@ -883,6 +921,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
         const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
         struct md_rdev *blocked_rdev;
+        struct blk_plug_cb *cb;
+        struct raid1_plug_cb *plug = NULL;
         int first_clone;
         int sectors_handled;
         int max_sectors;
@@ -1185,11 +1225,22 @@ read_again:
                 mbio->bi_private = r1_bio;
 
                 atomic_inc(&r1_bio->remaining);
+
+                cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
+                if (cb)
+                        plug = container_of(cb, struct raid1_plug_cb, cb);
+                else
+                        plug = NULL;
                 spin_lock_irqsave(&conf->device_lock, flags);
-                bio_list_add(&conf->pending_bio_list, mbio);
-                conf->pending_count++;
+                if (plug) {
+                        bio_list_add(&plug->pending, mbio);
+                        plug->pending_cnt++;
+                } else {
+                        bio_list_add(&conf->pending_bio_list, mbio);
+                        conf->pending_count++;
+                }
                 spin_unlock_irqrestore(&conf->device_lock, flags);
-                if (!mddev_check_plugged(mddev))
+                if (!plug)
                         md_wakeup_thread(mddev->thread);
         }
         /* Mustn't call r1_bio_write_done before this next test,
--
cgit v1.2.2
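As a closing illustrative note (not part of any patch above): the new
raid1_unplug() callback is driven by the generic block-layer plugging API.
A rough sketch of the caller side, with an invented helper name, shows when
the two branches of raid1_unplug() run:

    /* Sketch only: a submitter batching writes to a raid1 array under
     * a plug; the helper name and surrounding details are invented. */
    static void submit_write_batch(struct bio **bios, int n)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);
            for (i = 0; i < n; i++)
                    /* enters raid1's make_request(), which attaches
                     * raid1_unplug() via blk_check_plugged() */
                    generic_make_request(bios[i]);
            blk_finish_plug(&plug);
            /* raid1_unplug(cb, false) ran here and issued the queued bios;
             * had the task scheduled while still plugged, the callback would
             * have run with from_schedule == true and handed the bios to the
             * raid1 thread instead. */
    }

Because the bios are issued from the submitting task whenever possible, the
single md thread is no longer a bottleneck at high write rates.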