From 2214c260c72b0bd94e6c1c19bf451686212025d3 Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Mon, 8 May 2017 11:56:55 +0200
Subject: md: don't return -EAGAIN in md_allow_write for external metadata
 arrays

This essentially reverts commit b5470dc5fc18 ("md: resolve external
metadata handling deadlock in md_allow_write") with some adjustments.

Since commit 6791875e2e53 ("md: make reconfig_mutex optional for writes
to md sysfs files.") changing array_state to 'active' does not use
mddev_lock() and will not cause a deadlock with md_allow_write(). This
revert simplifies userspace tools that write to sysfs attributes like
"stripe_cache_size" or "consistency_policy" because it removes the need
for special handling for external metadata arrays, checking for EAGAIN
and retrying the write.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7ed59351fe97..7c1f73398800 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3197,7 +3197,7 @@ static int raid1_reshape(struct mddev *mddev)
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2, err;
+	int d, d2;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3209,11 +3209,8 @@ static int raid1_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
-	if (!mddev_is_clustered(mddev)) {
-		err = md_allow_write(mddev);
-		if (err)
-			return err;
-	}
+	if (!mddev_is_clustered(mddev))
+		md_allow_write(mddev);
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
-- 
cgit v1.2.2


From 23b245c04d0ef408087430dd4d1b214a5da1eb78 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 10 May 2017 08:47:11 -0700
Subject: md/raid1/10: avoid unnecessary locking

If we add bios to block plugging list, locking is unnecessry, since the block
unplug is guaranteed not to run at that time.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7c1f73398800..a17ed6218d51 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1529,17 +1529,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			plug = container_of(cb, struct raid1_plug_cb, cb);
 		else
 			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		if (plug) {
 			bio_list_add(&plug->pending, mbio);
 			plug->pending_cnt++;
 		} else {
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
-		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			md_wakeup_thread(mddev->thread);
+		}
 	}
 
 	r1_bio_write_done(r1_bio);
-- 
cgit v1.2.2


From d82dd0e34d0347be201fd274dc84cd645dccc064 Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Date: Fri, 12 May 2017 14:26:10 +0200
Subject: raid1: prefer disk without bad blocks

If an array consists of two drives and the first drive has the bad
block, the read request to the region overlapping the bad block chooses
the same disk (with bad block) as device to read from over and over and
the request gets stuck. If the first disk only partially overlaps with
bad block, it becomes a candidate ("best disk") for shorter range of
sectors. The second disk is capable of reading the entire requested
range and it is updated accordingly, however it is not recorded as a
best device for the request. In the end the request is sent to the first
disk to read entire range of sectors. It fails and is re-tried in a
moment but with the same outcome.

Actually it is quite likely scenario but it had little exposure in my
test until commit 715d40b93b10 ("md/raid1: add failfast handling for
reads.") removed preference for idle disk. Such scenario had been
passing as second disk was always chosen when idle.

Reset a candidate ("best disk") to read from if disk can read entire
range. Do it only if other disk has already been chosen as a candidate
for a smaller range. The head position / disk type logic will select
the best disk to read from - it is fine as disk with bad block won't be
considered for it.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a17ed6218d51..af5056d56878 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 					break;
 			}
 			continue;
-		} else
+		} else {
+			if ((sectors > best_good_sectors) && (best_disk >= 0))
+				best_disk = -1;
 			best_good_sectors = sectors;
+		}
 
 		if (best_disk >= 0)
 			/* At least two disks to choose from so failfast is OK */
-- 
cgit v1.2.2