From 874d2f61d31e596c36af7732dc1b3aa2dc233824 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Tue, 30 Jun 2009 15:18:14 +0100
Subject: dm exception store: really fix type lookup

Fix exception store name handling.

We need to reference exception store by zero terminated string.

Fixes regression introduced in commit f6bd4eb73cdf2a5bf954e497972842f39cabb7e3

Cc: Yi Yang <yi.y.yang@intel.com>
Cc: Jonathan Brassow <jbrassow@redhat.com>
Cc: stable@kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index c3ae51584b12..3710ff88fc10 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -195,7 +195,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 			      struct dm_exception_store **store)
 {
 	int r = 0;
-	struct dm_exception_store_type *type;
+	struct dm_exception_store_type *type = NULL;
 	struct dm_exception_store *tmp_store;
 	char persistent;
 
@@ -211,12 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 	}
 
 	persistent = toupper(*argv[1]);
-	if (persistent != 'P' && persistent != 'N') {
+	if (persistent == 'P')
+		type = get_type("P");
+	else if (persistent == 'N')
+		type = get_type("N");
+	else {
 		ti->error = "Persistent flag is not P or N";
 		return -EINVAL;
 	}
 
-	type = get_type(&persistent);
 	if (!type) {
 		ti->error = "Exception store type not recognised";
 		r = -EINVAL;
-- 
cgit v1.2.2


From ea9df47cc92573b159ef3b4fda516c32cba9c4fd Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 30 Jun 2009 15:18:17 +0100
Subject: dm table: fix blk_stack_limits arg to use bytes not sectors

The offset passed to blk_stack_limits() must be in bytes not sectors.
Fixes false warnings like the following:
device-mapper: table: 254:1: target device sda6 is misaligned

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reported-by: Frans Pop <elendil@planet.nl>
Tested-by: Frans Pop <elendil@planet.nl>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4899ebe767c8..2cba557d9e61 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -495,7 +495,7 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 		return 0;
 	}
 
-	if (blk_stack_limits(limits, &q->limits, start) < 0)
+	if (blk_stack_limits(limits, &q->limits, start << 9) < 0)
 		DMWARN("%s: target device %s is misaligned",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
 
-- 
cgit v1.2.2


From 8f6c2e4b325a8e9f8f47febb2fd0ed4fae7d45a9 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Jul 2009 11:13:45 +1000
Subject: md: Use new topology calls to indicate alignment and I/O sizes

Switch MD over to the new disk_stack_limits() function which checks for
aligment and adjusts preferred I/O sizes when stacking.

Also indicate preferred I/O sizes where applicable.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c    |  4 ++--
 drivers/md/multipath.c |  7 ++++---
 drivers/md/raid0.c     |  9 +++++++--
 drivers/md/raid1.c     |  9 ++++-----
 drivers/md/raid10.c    | 19 +++++++++++++------
 drivers/md/raid5.c     | 10 +++++++++-
 6 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 15c8b7b25a9b..5810fa906af0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -166,8 +166,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 			rdev->sectors = sectors * mddev->chunk_sectors;
 		}
 
-		blk_queue_stack_limits(mddev->queue,
-				       rdev->bdev->bd_disk->queue);
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, so limit ->max_sector to one PAGE, as
 		 * a one page request is never in violation.
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index cbe368fa6598..237fe3fd235c 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -294,7 +294,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	for (path = first; path <= last; path++)
 		if ((p=conf->multipaths+path)->rdev == NULL) {
 			q = rdev->bdev->bd_disk->queue;
-			blk_queue_stack_limits(mddev->queue, q);
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, so limit ->max_sector to one PAGE, as
@@ -463,9 +464,9 @@ static int multipath_run (mddev_t *mddev)
 
 		disk = conf->multipaths + disk_idx;
 		disk->rdev = rdev;
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
 
-		blk_queue_stack_limits(mddev->queue,
-				       rdev->bdev->bd_disk->queue);
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, not that we ever expect a device with
 		 * a merge_bvec_fn to be involved in multipath */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ab4a489d8695..335f490dcad6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -170,8 +170,8 @@ static int create_strip_zones(mddev_t *mddev)
 		}
 		dev[j] = rdev1;
 
-		blk_queue_stack_limits(mddev->queue,
-				       rdev1->bdev->bd_disk->queue);
+		disk_stack_limits(mddev->gendisk, rdev1->bdev,
+				  rdev1->data_offset << 9);
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, so limit ->max_sector to one PAGE, as
 		 * a one page request is never in violation.
@@ -250,6 +250,11 @@ static int create_strip_zones(mddev_t *mddev)
 		       mddev->chunk_sectors << 9);
 		goto abort;
 	}
+
+	blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+	blk_queue_io_opt(mddev->queue,
+			 (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
 	printk(KERN_INFO "raid0: done.\n");
 	mddev->private = conf;
 	return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 89939a7aef57..0569efba0c02 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1123,8 +1123,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	for (mirror = first; mirror <= last; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
-			blk_queue_stack_limits(mddev->queue,
-					       rdev->bdev->bd_disk->queue);
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 			/* as we don't honour merge_bvec_fn, we must never risk
 			 * violating it, so limit ->max_sector to one PAGE, as
 			 * a one page request is never in violation.
@@ -1988,9 +1988,8 @@ static int run(mddev_t *mddev)
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
-
-		blk_queue_stack_limits(mddev->queue,
-				       rdev->bdev->bd_disk->queue);
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, so limit ->max_sector to one PAGE, as
 		 * a one page request is never in violation.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ae12ceafe10c..7298a5e5a183 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1151,8 +1151,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	for ( ; mirror <= last ; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
-			blk_queue_stack_limits(mddev->queue,
-					       rdev->bdev->bd_disk->queue);
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 			/* as we don't honour merge_bvec_fn, we must never risk
 			 * violating it, so limit ->max_sector to one PAGE, as
 			 * a one page request is never in violation.
@@ -2044,7 +2044,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 static int run(mddev_t *mddev)
 {
 	conf_t *conf;
-	int i, disk_idx;
+	int i, disk_idx, chunk_size;
 	mirror_info_t *disk;
 	mdk_rdev_t *rdev;
 	int nc, fc, fo;
@@ -2130,6 +2130,14 @@ static int run(mddev_t *mddev)
 	spin_lock_init(&conf->device_lock);
 	mddev->queue->queue_lock = &conf->device_lock;
 
+	chunk_size = mddev->chunk_sectors << 9;
+	blk_queue_io_min(mddev->queue, chunk_size);
+	if (conf->raid_disks % conf->near_copies)
+		blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
+	else
+		blk_queue_io_opt(mddev->queue, chunk_size *
+				 (conf->raid_disks / conf->near_copies));
+
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		disk_idx = rdev->raid_disk;
 		if (disk_idx >= mddev->raid_disks
@@ -2138,9 +2146,8 @@ static int run(mddev_t *mddev)
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
-
-		blk_queue_stack_limits(mddev->queue,
-				       rdev->bdev->bd_disk->queue);
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, so limit ->max_sector to one PAGE, as
 		 * a one page request is never in violation.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f9f991e6e138..92ef9b6abfc7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4452,7 +4452,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 static int run(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
-	int working_disks = 0;
+	int working_disks = 0, chunk_size;
 	mdk_rdev_t *rdev;
 
 	if (mddev->recovery_cp != MaxSector)
@@ -4607,6 +4607,14 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+	chunk_size = mddev->chunk_sectors << 9;
+	blk_queue_io_min(mddev->queue, chunk_size);
+	blk_queue_io_opt(mddev->queue, chunk_size *
+			 (conf->raid_disks - conf->max_degraded));
+
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
 
 	return 0;
 abort:
-- 
cgit v1.2.2


From b8d966efd9a46a9a35beac50cbff6e30565125ef Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 1 Jul 2009 11:14:04 +1000
Subject: md: avoid dereferencing NULL pointer when accessing suspend_* sysfs
 attributes.

If we try to modify one of the md/ sysfs files
  suspend_lo or suspend_hi
when the array is not active, we dereference a NULL.
Protect against that.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 09be637d52cb..2166af8a7654 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3573,7 +3573,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
 
-	if (mddev->pers->quiesce == NULL)
+	if (mddev->pers == NULL || 
+	    mddev->pers->quiesce == NULL)
 		return -EINVAL;
 	if (buf == e || (*e && *e != '\n'))
 		return -EINVAL;
@@ -3601,7 +3602,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
 
-	if (mddev->pers->quiesce == NULL)
+	if (mddev->pers == NULL ||
+	    mddev->pers->quiesce == NULL)
 		return -EINVAL;
 	if (buf == e || (*e && *e != '\n'))
 		return -EINVAL;
-- 
cgit v1.2.2


From 1ec22eb2b4a2e1a763106bce36b11c02eaa84e61 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 1 Jul 2009 12:27:21 +1000
Subject: md: fix error path when duplicate name is found on md device
 creation.

When an md device is created by name (rather than number) we need to
check that the name is not already in use.  If this check finds a
duplicate, we return an error without dropping the lock or freeing
the newly create mddev.
This patch fixes that.

Cc: stable@kernel.org
Found-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2166af8a7654..58bee2366ea8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3862,6 +3862,8 @@ static int md_alloc(dev_t dev, char *name)
 			if (mddev2->gendisk &&
 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
 				spin_unlock(&all_mddevs_lock);
+				mutex_unlock(&disks_mutex);
+				mddev_put(mddev);
 				return -EEXIST;
 			}
 		spin_unlock(&all_mddevs_lock);
-- 
cgit v1.2.2


From 0909dc448c98ed5021c87ffdfc09fb473aa464ab Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 1 Jul 2009 12:27:21 +1000
Subject: md: tidy up error paths in md_alloc

As the recent bug in md_alloc showed, having a single exit path for
unlocking and putting is a good idea.  So restructure md_alloc to have
a single mutex_unlock and mddev_put, and use gotos where necessary.

Found-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 58bee2366ea8..65fe35b5e34a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3846,11 +3846,9 @@ static int md_alloc(dev_t dev, char *name)
 	flush_scheduled_work();
 
 	mutex_lock(&disks_mutex);
-	if (mddev->gendisk) {
-		mutex_unlock(&disks_mutex);
-		mddev_put(mddev);
-		return -EEXIST;
-	}
+	error = -EEXIST;
+	if (mddev->gendisk)
+		goto abort;
 
 	if (name) {
 		/* Need to ensure that 'name' is not a duplicate.
@@ -3862,19 +3860,15 @@ static int md_alloc(dev_t dev, char *name)
 			if (mddev2->gendisk &&
 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
 				spin_unlock(&all_mddevs_lock);
-				mutex_unlock(&disks_mutex);
-				mddev_put(mddev);
-				return -EEXIST;
+				goto abort;
 			}
 		spin_unlock(&all_mddevs_lock);
 	}
 
+	error = -ENOMEM;
 	mddev->queue = blk_alloc_queue(GFP_KERNEL);
-	if (!mddev->queue) {
-		mutex_unlock(&disks_mutex);
-		mddev_put(mddev);
-		return -ENOMEM;
-	}
+	if (!mddev->queue)
+		goto abort;
 	mddev->queue->queuedata = mddev;
 
 	/* Can be unlocked because the queue is new: no concurrency */
@@ -3884,11 +3878,9 @@ static int md_alloc(dev_t dev, char *name)
 
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
-		mutex_unlock(&disks_mutex);
 		blk_cleanup_queue(mddev->queue);
 		mddev->queue = NULL;
-		mddev_put(mddev);
-		return -ENOMEM;
+		goto abort;
 	}
 	disk->major = MAJOR(mddev->unit);
 	disk->first_minor = unit << shift;
@@ -3910,16 +3902,22 @@ static int md_alloc(dev_t dev, char *name)
 	mddev->gendisk = disk;
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
-	mutex_unlock(&disks_mutex);
-	if (error)
+	if (error) {
+		/* This isn't possible, but as kobject_init_and_add is marked
+		 * __must_check, we must do something with the result
+		 */
 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
 		       disk->disk_name);
-	else {
+		error = 0;
+	}
+ abort:
+	mutex_unlock(&disks_mutex);
+	if (!error) {
 		kobject_uevent(&mddev->kobj, KOBJ_ADD);
 		mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
 	}
 	mddev_put(mddev);
-	return 0;
+	return error;
 }
 
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
-- 
cgit v1.2.2


From a5c308d4d1659b1f4833b863394e3e24cdbdfc6e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 1 Jul 2009 13:15:35 +1000
Subject: md/raid5: suspend shouldn't affect read requests.

md allows write to regions on an array to be suspended temporarily.
This allows user-space to participate is aspects of reshape.
In particular, data can be copied with not risk of a race.
We should not be blocking read requests though, so don't.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 92ef9b6abfc7..1f444ae07f89 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3702,7 +3702,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
 			/* FIXME what if we get a false positive because these
 			 * are being updated.
 			 */
-			if (logical_sector >= mddev->suspend_lo &&
+			if (bio_data_dir(bi) == WRITE &&
+			    logical_sector >= mddev->suspend_lo &&
 			    logical_sector < mddev->suspend_hi) {
 				release_stripe(sh);
 				schedule();
-- 
cgit v1.2.2


From e62e58a5ffdc98ac28d8dbd070c857620d541f99 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 1 Jul 2009 13:15:35 +1000
Subject: md: use interruptible wait when duration is controlled by userspace.

User space can set various limits on an md array so that resync waits
when it gets to a certain point, or so that I/O is blocked for a short
while.
When md is waiting against one of these limit, it should use an
interruptible wait so as not to add to the load average, and so are
not to trigger a warning if the wait goes on for too long.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 14 ++++++++++----
 drivers/md/raid5.c | 15 +++++++++++----
 2 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 65fe35b5e34a..0f4a70c43ffc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6336,10 +6336,16 @@ void md_do_sync(mddev_t *mddev)
 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 		}
 
-		if (j >= mddev->resync_max)
-			wait_event(mddev->recovery_wait,
-				   mddev->resync_max > j
-				   || kthread_should_stop());
+		while (j >= mddev->resync_max && !kthread_should_stop()) {
+			/* As this condition is controlled by user-space,
+			 * we can block indefinitely, so use '_interruptible'
+			 * to avoid triggering warnings.
+			 */
+			flush_signals(current); /* just in case */
+			wait_event_interruptible(mddev->recovery_wait,
+						 mddev->resync_max > j
+						 || kthread_should_stop());
+		}
 
 		if (kthread_should_stop())
 			goto interrupted;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1f444ae07f89..37835538b58e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3699,14 +3699,21 @@ static int make_request(struct request_queue *q, struct bio * bi)
 					goto retry;
 				}
 			}
-			/* FIXME what if we get a false positive because these
-			 * are being updated.
-			 */
+
 			if (bio_data_dir(bi) == WRITE &&
 			    logical_sector >= mddev->suspend_lo &&
 			    logical_sector < mddev->suspend_hi) {
 				release_stripe(sh);
-				schedule();
+				/* As the suspend_* range is controlled by
+				 * userspace, we want an interruptible
+				 * wait.
+				 */
+				flush_signals(current);
+				prepare_to_wait(&conf->wait_for_overlap,
+						&w, TASK_INTERRUPTIBLE);
+				if (logical_sector >= mddev->suspend_lo &&
+				    logical_sector < mddev->suspend_hi)
+					schedule();
 				goto retry;
 			}
 
-- 
cgit v1.2.2


From 7878cba9f0037f5599004b03a1260b32d9050360 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 26 Jun 2009 15:37:49 +0200
Subject: block: Create bip slabs with embedded integrity vectors

This patch restores stacking ability to the block layer integrity
infrastructure by creating a set of dedicated bip slabs.  Each bip slab
has an embedded bio_vec array at the end.  This cuts down on memory
allocations and also simplifies the code compared to the original bvec
version.  Only the largest bip slab is backed by a mempool.  The pool is
contained in the bio_set so stacking drivers can ensure forward
progress.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@carl.(none)>
---
 drivers/md/dm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3c6d4ee8921d..9acd54a5cffb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1017,7 +1017,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 	clone->bi_flags |= 1 << BIO_CLONED;
 
 	if (bio_integrity(bio)) {
-		bio_integrity_clone(clone, bio, GFP_NOIO);
+		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
 		bio_integrity_trim(clone,
 				   bio_sector_offset(bio, idx, offset), len);
 	}
@@ -1045,7 +1045,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	if (bio_integrity(bio)) {
-		bio_integrity_clone(clone, bio, GFP_NOIO);
+		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
 
 		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
 			bio_integrity_trim(clone,
-- 
cgit v1.2.2


From ad361c9884e809340f6daca80d56a9e9c871690a Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 6 Jul 2009 13:05:40 -0700
Subject: Remove multiple KERN_ prefixes from printk formats

Commit 5fd29d6ccbc98884569d6f3105aeca70858b3e0f ("printk: clean up
handling of log-levels and newlines") changed printk semantics.  printk
lines with multiple KERN_<level> prefixes are no longer emitted as
before the patch.

<level> is now included in the output on each additional use.

Remove all uses of multiple KERN_<level>s in formats.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0f4a70c43ffc..d4351ff0849f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1756,9 +1756,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 	__u8 *uuid;
 
 	uuid = sb->set_uuid;
-	printk(KERN_INFO "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
-			":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
-	       KERN_INFO "md:    Name: \"%s\" CT:%llu\n",
+	printk(KERN_INFO
+	       "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
+	       ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
+	       "md:    Name: \"%s\" CT:%llu\n",
 		le32_to_cpu(sb->major_version),
 		le32_to_cpu(sb->feature_map),
 		uuid[0], uuid[1], uuid[2], uuid[3],
@@ -1770,12 +1771,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 		       & MD_SUPERBLOCK_1_TIME_SEC_MASK);
 
 	uuid = sb->device_uuid;
-	printk(KERN_INFO "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
+	printk(KERN_INFO
+	       "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
 			" RO:%llu\n"
-	       KERN_INFO "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
-			":%02x%02x%02x%02x%02x%02x\n"
-	       KERN_INFO "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
-	       KERN_INFO "md:         (MaxDev:%u) \n",
+	       "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
+	                ":%02x%02x%02x%02x%02x%02x\n"
+	       "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
+	       "md:         (MaxDev:%u) \n",
 		le32_to_cpu(sb->level),
 		(unsigned long long)le64_to_cpu(sb->size),
 		le32_to_cpu(sb->raid_disks),
-- 
cgit v1.2.2


From 8aa7e847d834ed937a9ad37a0f2ad5b8584c1ab0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 9 Jul 2009 14:52:32 +0200
Subject: Fix congestion_wait() sync/async vs read/write confusion

Commit 1faa16d22877f4839bd433547d770c676d1d964c accidentally broke
the bdi congestion wait queue logic, causing us to wait on congestion
for WRITE (== 1) when we really wanted BLK_RW_ASYNC (== 0) instead.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/md/dm-crypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9933eb861c71..529e2ba505c3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * But don't wait if split was due to the io size restriction
 		 */
 		if (unlikely(out_of_pages))
-			congestion_wait(WRITE, HZ/100);
+			congestion_wait(BLK_RW_ASYNC, HZ/100);
 
 		/*
 		 * With async crypto it is unsafe to share the crypto context
-- 
cgit v1.2.2


From 69885683d22d8c05910fd808c01fdce1322739b4 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 23 Jul 2009 20:30:37 +0100
Subject: dm raid1: wake kmirrord when requeueing delayed bios after remote
 recovery

The recent commit 7513c2a761d69d2a93f17146b3563527d3618ba0 (dm raid1:
add is_remote_recovering hook for clusters) changed do_writes() to
update the ms->writes list but forgot to wake up kmirrord to process it.

The rule is that when anything is being added on ms->reads, ms->writes
or ms->failures and the list was empty before we must call
wakeup_mirrord (for immediate processing) or delayed_wake (for delayed
processing).  Otherwise the bios could sit on the list indefinitely.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
CC: stable@kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ce8868c768cc..42a3e4341d60 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -638,6 +638,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		spin_lock_irq(&ms->lock);
 		bio_list_merge(&ms->writes, &requeue);
 		spin_unlock_irq(&ms->lock);
+		delayed_wake(ms);
 	}
 
 	/*
-- 
cgit v1.2.2


From a732c207d19e899845ae47139708af898daaf9fd Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 23 Jul 2009 20:30:40 +0100
Subject: dm: remove queue next_ordered workaround for barriers

This patch removes DM's bio-based vs request-based conditional setting
of next_ordered.  For bio-based DM the next_ordered check is no longer a
concern (as that check is now in the __make_request path).  For
request-based DM the default of QUEUE_ORDERED_NONE is now appropriate.

bio-based DM was changed to work-around the previously misplaced
next_ordered check with this commit:
99360b4c18f7675b50d283301d46d755affe75fd

request-based DM does not yet support barriers but reacted to the above
bio-based DM change with this commit:
5d67aa2366ccb8257d103d0b43df855605c3c086

The above changes are no longer needed given Neil Brown's recent fix to
put the next_ordered check in the __make_request path:
db64f680ba4b5c56c4be59f0698000df89ff0281

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: NeilBrown <neilb@suse.de>
Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Acked-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c |  5 -----
 drivers/md/dm.c       | 10 ----------
 drivers/md/dm.h       |  1 -
 3 files changed, 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2cba557d9e61..5d16d06613aa 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -830,11 +830,6 @@ unsigned dm_table_get_type(struct dm_table *t)
 	return t->type;
 }
 
-bool dm_table_bio_based(struct dm_table *t)
-{
-	return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
-}
-
 bool dm_table_request_based(struct dm_table *t)
 {
 	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9acd54a5cffb..8a311ea0d441 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2203,16 +2203,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 		goto out;
 	}
 
-	/*
-	 * It is enought that blk_queue_ordered() is called only once when
-	 * the first bio-based table is bound.
-	 *
-	 * This setting should be moved to alloc_dev() when request-based dm
-	 * supports barrier.
-	 */
-	if (!md->map && dm_table_bio_based(table))
-		blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
-
 	__unbind(md);
 	r = __bind(md, table, &limits);
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 23278ae80f08..a7663eba17e2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -61,7 +61,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 int dm_table_any_busy_target(struct dm_table *t);
 int dm_table_set_type(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
-bool dm_table_bio_based(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
 int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
-- 
cgit v1.2.2


From 5dea271b6d87bd1d79a59c1d5baac2596a841c37 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 23 Jul 2009 20:30:42 +0100
Subject: dm table: pass correct dev area size to device_area_is_valid

Incorrect device area lengths are being passed to device_area_is_valid().

The regression appeared in 2.6.31-rc1 through commit
754c5fc7ebb417b23601a6222a6005cc2e7f2913.

With the dm-stripe target, the size of the target (ti->len) was used
instead of the stripe_width (ti->len/#stripes).  An example of a
consequent incorrect error message is:

  device-mapper: table: 254:0: sdb too small for target

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c  |  2 +-
 drivers/md/dm-delay.c  |  4 ++--
 drivers/md/dm-linear.c |  2 +-
 drivers/md/dm-mpath.c  |  2 +-
 drivers/md/dm-raid1.c  |  2 +-
 drivers/md/dm-stripe.c |  7 ++++---
 drivers/md/dm-table.c  | 10 +++++-----
 7 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 529e2ba505c3..ed1038164019 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1318,7 +1318,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 {
 	struct crypt_config *cc = ti->private;
 
-	return fn(ti, cc->dev, cc->start, data);
+	return fn(ti, cc->dev, cc->start, ti->len, data);
 }
 
 static struct target_type crypt_target = {
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 4e5b843cd4d7..ebe7381f47c8 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -324,12 +324,12 @@ static int delay_iterate_devices(struct dm_target *ti,
 	struct delay_c *dc = ti->private;
 	int ret = 0;
 
-	ret = fn(ti, dc->dev_read, dc->start_read, data);
+	ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
 	if (ret)
 		goto out;
 
 	if (dc->dev_write)
-		ret = fn(ti, dc->dev_write, dc->start_write, data);
+		ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
 
 out:
 	return ret;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9184b6deb868..82f7d6e6b1ea 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -139,7 +139,7 @@ static int linear_iterate_devices(struct dm_target *ti,
 {
 	struct linear_c *lc = ti->private;
 
-	return fn(ti, lc->dev, lc->start, data);
+	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
 static struct target_type linear_target = {
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c70604a20897..6f0d90d4a541 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1453,7 +1453,7 @@ static int multipath_iterate_devices(struct dm_target *ti,
 
 	list_for_each_entry(pg, &m->priority_groups, list) {
 		list_for_each_entry(p, &pg->pgpaths, list) {
-			ret = fn(ti, p->path.dev, ti->begin, data);
+			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
 			if (ret)
 				goto out;
 		}
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 42a3e4341d60..9726577cde49 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1293,7 +1293,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 	for (i = 0; !ret && i < ms->nr_mirrors; i++)
 		ret = fn(ti, ms->mirror[i].dev,
-			 ms->mirror[i].offset, data);
+			 ms->mirror[i].offset, ti->len, data);
 
 	return ret;
 }
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b240e85ae39a..4e0e5937e42a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -320,10 +320,11 @@ static int stripe_iterate_devices(struct dm_target *ti,
 	int ret = 0;
 	unsigned i = 0;
 
-	do
+	do {
 		ret = fn(ti, sc->stripe[i].dev,
-			 sc->stripe[i].physical_start, data);
-	while (!ret && ++i < sc->stripes);
+			 sc->stripe[i].physical_start,
+			 sc->stripe_width, data);
+	} while (!ret && ++i < sc->stripes);
 
 	return ret;
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5d16d06613aa..d952b3441913 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -346,7 +346,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
  * If possible, this checks an area of a destination device is valid.
  */
 static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
-				sector_t start, void *data)
+				sector_t start, sector_t len, void *data)
 {
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
@@ -359,7 +359,7 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
 	if (!dev_size)
 		return 1;
 
-	if ((start >= dev_size) || (start + ti->len > dev_size)) {
+	if ((start >= dev_size) || (start + len > dev_size)) {
 		DMWARN("%s: %s too small for target",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
 		return 0;
@@ -377,11 +377,11 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
 		return 0;
 	}
 
-	if (ti->len & (logical_block_size_sectors - 1)) {
+	if (len & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: len=%llu not aligned to h/w "
 		       "logical block size %hu of %s",
 		       dm_device_name(ti->table->md),
-		       (unsigned long long)ti->len,
+		       (unsigned long long)len,
 		       limits->logical_block_size, bdevname(bdev, b));
 		return 0;
 	}
@@ -482,7 +482,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 
 int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
-			 sector_t start, void *data)
+			 sector_t start, sector_t len, void *data)
 {
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
-- 
cgit v1.2.2


From 95fc17aac45300f45968aacd97a536ddd8db8101 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 31 Jul 2009 12:39:15 +1000
Subject: md/raid6: release spare page at ->stop()

Add missing call to safe_put_page from stop() by unifying open coded
raid5_conf_t de-allocation under free_conf().

Cc: <stable@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 37835538b58e..39374230a463 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4316,6 +4316,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	return sectors * (raid_disks - conf->max_degraded);
 }
 
+static void free_conf(raid5_conf_t *conf)
+{
+	shrink_stripes(conf);
+	safe_put_page(conf->spare_page);
+	kfree(conf->disks);
+	kfree(conf->stripe_hashtbl);
+	kfree(conf);
+}
+
 static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
 	raid5_conf_t *conf;
@@ -4447,11 +4456,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 
  abort:
 	if (conf) {
-		shrink_stripes(conf);
-		safe_put_page(conf->spare_page);
-		kfree(conf->disks);
-		kfree(conf->stripe_hashtbl);
-		kfree(conf);
+		free_conf(conf);
 		return ERR_PTR(-EIO);
 	} else
 		return ERR_PTR(-ENOMEM);
@@ -4629,12 +4634,8 @@ abort:
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	if (conf) {
-		shrink_stripes(conf);
 		print_raid5_conf(conf);
-		safe_put_page(conf->spare_page);
-		kfree(conf->disks);
-		kfree(conf->stripe_hashtbl);
-		kfree(conf);
+		free_conf(conf);
 	}
 	mddev->private = NULL;
 	printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4649,13 +4650,10 @@ static int stop(mddev_t *mddev)
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
-	shrink_stripes(conf);
-	kfree(conf->stripe_hashtbl);
 	mddev->queue->backing_dev_info.congested_fn = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
-	kfree(conf->disks);
-	kfree(conf);
+	free_conf(conf);
 	mddev->private = NULL;
 	return 0;
 }
-- 
cgit v1.2.2


From ac5e7113e74872928844d00085bd47c988f12728 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 3 Aug 2009 10:59:47 +1000
Subject: md: Push down data integrity code to personalities.

This patch replaces md_integrity_check() by two new public functions:
md_integrity_register() and md_integrity_add_rdev() which are both
personality-independent.

md_integrity_register() is called from the ->run and ->hot_remove
methods of all personalities that support data integrity.  The
function iterates over the component devices of the array and
determines if all active devices are integrity capable and if their
profiles match. If this is the case, the common profile is registered
for the mddev via blk_integrity_register().

The second new function, md_integrity_add_rdev() is called from the
->hot_add_disk methods, i.e. whenever a new device is being added
to a raid array. If the new device does not support data integrity,
or has a profile different from the one already registered, data
integrity for the mddev is disabled.

For raid0 and linear, only the call to md_integrity_register() from
the ->run method is necessary.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c    |  1 +
 drivers/md/md.c        | 94 ++++++++++++++++++++++++++++++++++----------------
 drivers/md/md.h        |  2 ++
 drivers/md/multipath.c |  5 ++-
 drivers/md/raid0.c     |  1 +
 drivers/md/raid1.c     |  6 ++--
 drivers/md/raid10.c    |  4 +++
 7 files changed, 80 insertions(+), 33 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5810fa906af0..54c8677f1e59 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -220,6 +220,7 @@ static int linear_run (mddev_t *mddev)
 	mddev->queue->unplug_fn = linear_unplug;
 	mddev->queue->backing_dev_info.congested_fn = linear_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
+	md_integrity_register(mddev);
 	return 0;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4351ff0849f..180949e94a7b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1487,37 +1487,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 
 static LIST_HEAD(pending_raid_disks);
 
-static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+/*
+ * Try to register data integrity profile for an mddev
+ *
+ * This is called when an array is started and after a disk has been kicked
+ * from the array. It only succeeds if all working and active component devices
+ * are integrity capable with matching profiles.
+ */
+int md_integrity_register(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev, *reference = NULL;
+
+	if (list_empty(&mddev->disks))
+		return 0; /* nothing to do */
+	if (blk_get_integrity(mddev->gendisk))
+		return 0; /* already registered */
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		/* skip spares and non-functional disks */
+		if (test_bit(Faulty, &rdev->flags))
+			continue;
+		if (rdev->raid_disk < 0)
+			continue;
+		/*
+		 * If at least one rdev is not integrity capable, we can not
+		 * enable data integrity for the md device.
+		 */
+		if (!bdev_get_integrity(rdev->bdev))
+			return -EINVAL;
+		if (!reference) {
+			/* Use the first rdev as the reference */
+			reference = rdev;
+			continue;
+		}
+		/* does this rdev's profile match the reference profile? */
+		if (blk_integrity_compare(reference->bdev->bd_disk,
+				rdev->bdev->bd_disk) < 0)
+			return -EINVAL;
+	}
+	/*
+	 * All component devices are integrity capable and have matching
+	 * profiles, register the common profile for the md device.
+	 */
+	if (blk_integrity_register(mddev->gendisk,
+			bdev_get_integrity(reference->bdev)) != 0) {
+		printk(KERN_ERR "md: failed to register integrity for %s\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
+		mdname(mddev));
+	return 0;
+}
+EXPORT_SYMBOL(md_integrity_register);
+
+/* Disable data integrity if non-capable/non-matching disk is being added */
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 {
-	struct mdk_personality *pers = mddev->pers;
-	struct gendisk *disk = mddev->gendisk;
 	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
-	struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+	struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
 
-	/* Data integrity passthrough not supported on RAID 4, 5 and 6 */
-	if (pers && pers->level >= 4 && pers->level <= 6)
+	if (!bi_mddev) /* nothing to do */
 		return;
-
-	/* If rdev is integrity capable, register profile for mddev */
-	if (!bi_mddev && bi_rdev) {
-		if (blk_integrity_register(disk, bi_rdev))
-			printk(KERN_ERR "%s: %s Could not register integrity!\n",
-			       __func__, disk->disk_name);
-		else
-			printk(KERN_NOTICE "Enabling data integrity on %s\n",
-			       disk->disk_name);
+	if (rdev->raid_disk < 0) /* skip spares */
 		return;
-	}
-
-	/* Check that mddev and rdev have matching profiles */
-	if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
-		printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
-		       disk->disk_name, rdev->bdev->bd_disk->disk_name);
-		printk(KERN_NOTICE "Disabling data integrity on %s\n",
-		       disk->disk_name);
-		blk_integrity_unregister(disk);
-	}
+	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
+					     rdev->bdev->bd_disk) >= 0)
+		return;
+	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
+	blk_integrity_unregister(mddev->gendisk);
 }
+EXPORT_SYMBOL(md_integrity_add_rdev);
 
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
@@ -1591,7 +1630,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
 
-	md_integrity_check(rdev, mddev);
 	return 0;
 
  fail:
@@ -4048,10 +4086,6 @@ static int do_md_run(mddev_t * mddev)
 	}
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
-	if (pers->level >= 4 && pers->level <= 6)
-		/* Cannot support integrity (yet) */
-		blk_integrity_unregister(mddev->gendisk);
-
 	if (mddev->reshape_position != MaxSector &&
 	    pers->start_reshape == NULL) {
 		/* This personality cannot handle reshaping... */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 9430a110db93..78f03168baf9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -431,5 +431,7 @@ extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(mddev_t *mddev);
+extern int md_integrity_register(mddev_t *mddev);
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 
 #endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 237fe3fd235c..7140909f6662 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -313,6 +313,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			set_bit(In_sync, &rdev->flags);
 			rcu_assign_pointer(p->rdev, rdev);
 			err = 0;
+			md_integrity_add_rdev(rdev, mddev);
 			break;
 		}
 
@@ -345,7 +346,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
 			/* lost the race, try later */
 			err = -EBUSY;
 			p->rdev = rdev;
+			goto abort;
 		}
+		md_integrity_register(mddev);
 	}
 abort:
 
@@ -519,7 +522,7 @@ static int multipath_run (mddev_t *mddev)
 	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-
+	md_integrity_register(mddev);
 	return 0;
 
 out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 335f490dcad6..898e2bdfee47 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -351,6 +351,7 @@ static int raid0_run(mddev_t *mddev)
 
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
 	dump_zones(mddev);
+	md_integrity_register(mddev);
 	return 0;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0569efba0c02..67e794d0097f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1144,7 +1144,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
-
+	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
 }
@@ -1178,7 +1178,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
 			/* lost the race, try later */
 			err = -EBUSY;
 			p->rdev = rdev;
+			goto abort;
 		}
+		md_integrity_register(mddev);
 	}
 abort:
 
@@ -2067,7 +2069,7 @@ static int run(mddev_t *mddev)
 	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-
+	md_integrity_register(mddev);
 	return 0;
 
 out_no_mem:
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7298a5e5a183..3d9020cf6f6e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1170,6 +1170,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			break;
 		}
 
+	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
 }
@@ -1203,7 +1204,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
 			/* lost the race, try later */
 			err = -EBUSY;
 			p->rdev = rdev;
+			goto abort;
 		}
+		md_integrity_register(mddev);
 	}
 abort:
 
@@ -2225,6 +2228,7 @@ static int run(mddev_t *mddev)
 
 	if (conf->near_copies < mddev->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+	md_integrity_register(mddev);
 	return 0;
 
 out_free_conf:
-- 
cgit v1.2.2


From 3a981b03f38dc3b8a69b77cbc679e66c1318a44a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 Aug 2009 10:59:55 +1000
Subject: md: when a level change reduces the number of devices, remove the
 excess.

When an array is changed from RAID6 to RAID5, fewer drives are
needed.  So any device that is made superfluous by the level
conversion must be marked as not-active.
For the RAID6->RAID5 conversion, this will be a drive which only
has 'Q' blocks on it.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 180949e94a7b..c194955aecae 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2695,6 +2695,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	ssize_t rv = len;
 	struct mdk_personality *pers;
 	void *priv;
+	mdk_rdev_t *rdev;
 
 	if (mddev->pers == NULL) {
 		if (len == 0)
@@ -2774,6 +2775,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev_suspend(mddev);
 	mddev->pers->stop(mddev);
 	module_put(mddev->pers->owner);
+	/* Invalidate devices that are now superfluous */
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		if (rdev->raid_disk >= mddev->raid_disks) {
+			rdev->raid_disk = -1;
+			clear_bit(In_sync, &rdev->flags);
+		}
 	mddev->pers = pers;
 	mddev->private = priv;
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
-- 
cgit v1.2.2


From 3673f305faf1bc66ead751344f8262ace851ff44 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 Aug 2009 10:59:56 +1000
Subject: md: avoid array overflow with bad v1.x metadata

We trust the 'desc_nr' field in v1.x metadata enough to use it
as an index in an array.  This isn't really safe.
So range-check the value first.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c194955aecae..249b2896d4ea 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1308,7 +1308,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 	if (mddev->level != LEVEL_MULTIPATH) {
 		int role;
-		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+		if (rdev->desc_nr < 0 ||
+		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
+			role = 0xffff;
+			rdev->desc_nr = -1;
+		} else
+			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
 		switch(role) {
 		case 0xffff: /* spare */
 			break;
-- 
cgit v1.2.2


From 70471dafe3390243c598a3165dfb86b8b8b3f4fe Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 Aug 2009 10:59:57 +1000
Subject: md: Handle growth of v1.x metadata correctly.

The v1.x metadata does not have a fixed size and can grow
when devices are added.
If it grows enough to require an extra sector of storage,
we need to update the 'sb_size' to match.

Without this, md can write out an incomplete superblock with a
bad checksum, which will be rejected when trying to re-assemble
the array.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 249b2896d4ea..52c988b072d0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1399,8 +1399,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		if (rdev2->desc_nr+1 > max_dev)
 			max_dev = rdev2->desc_nr+1;
 
-	if (max_dev > le32_to_cpu(sb->max_dev))
+	if (max_dev > le32_to_cpu(sb->max_dev)) {
+		int bmask;
 		sb->max_dev = cpu_to_le32(max_dev);
+		rdev->sb_size = max_dev * 2 + 256;
+		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
+		if (rdev->sb_size & bmask)
+			rdev->sb_size = (rdev->sb_size | bmask) + 1;
+	}
 	for (i=0; i<max_dev;i++)
 		sb->dev_roles[i] = cpu_to_le16(0xfffe);
 	
-- 
cgit v1.2.2


From e516402c0d4fc02be4af9fa8c18954d4f9deb44e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 Aug 2009 10:59:57 +1000
Subject: md/raid5: set reshape_position correctly when reshape starts.

As the internal reshape_progress counter is the main driver
for reshape, the fact that reshape_position sometimes starts with the
wrong value has minimal effect.  It is visible in sysfs and that
is all.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 39374230a463..659151e5eda4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5000,7 +5000,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	mddev->raid_disks = conf->raid_disks;
-	mddev->reshape_position = 0;
+	mddev->reshape_position = conf->reshape_progress;
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
-- 
cgit v1.2.2


From 64bd660b51b2da92e99a5e97349f6558349f11c5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 Aug 2009 10:59:58 +1000
Subject: md: allow raid5_quiesce to work properly when reshape is happening.

The ->quiesce method is not supposed to stop resync/recovery/reshape,
just normal IO.
But in raid5 we don't have a way to know which stripes are being
used for normal IO and which for resync etc, so we need to wait for
all stripes to be idle to be sure that all writes have completed.

However reshape keeps at least some stripe busy for an extended period
of time, so a call to raid5_quiesce can block for several seconds
needlessly.
So arrange for reshape etc to pause briefly while raid5_quiesce is
trying to quiesce the array so that the active_stripes count can
drop to zero.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 659151e5eda4..2dc35b4c20ac 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3999,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 		return 0;
 	}
 
+	/* Allow raid5_quiesce to complete */
+	wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		return reshape_request(mddev, sector_nr, skipped);
 
@@ -5104,12 +5107,18 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 
 	case 1: /* stop all writes */
 		spin_lock_irq(&conf->device_lock);
-		conf->quiesce = 1;
+		/* '2' tells resync/reshape to pause so that all
+		 * active stripes can drain
+		 */
+		conf->quiesce = 2;
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
 				    conf->device_lock, /* nothing */);
+		conf->quiesce = 1;
 		spin_unlock_irq(&conf->device_lock);
+		/* allow reshape to continue */
+		wake_up(&conf->wait_for_overlap);
 		break;
 
 	case 0: /* re-enable writes */
-- 
cgit v1.2.2


From 449aad3e25358812c43afc60918c5ad3819488e7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 3 Aug 2009 10:59:58 +1000
Subject: md: Use revalidate_disk to effect changes in size of device.

As revalidate_disk calls check_disk_size_change, it will cause
any capacity change of a gendisk to be propagated to the blockdev
inode.  So use that instead of mucking about with locks and
i_size_write.

Also add a call to revalidate_disk in do_md_run and a few other places
where the gendisk capacity is changed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c |  1 +
 drivers/md/md.c     | 28 +++++-----------------------
 drivers/md/raid1.c  |  1 +
 drivers/md/raid5.c  | 12 ++----------
 4 files changed, 9 insertions(+), 33 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 54c8677f1e59..5fe39c2a3d2b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -257,6 +257,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	rcu_assign_pointer(mddev->private, newconf);
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
+	revalidate_disk(mddev->gendisk);
 	call_rcu(&oldconf->rcu, free_conf);
 	return 0;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 52c988b072d0..5b98bea4ff9b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3741,17 +3741,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
 
 	mddev->array_sectors = sectors;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
-	if (mddev->pers) {
-		struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
-
-		if (bdev) {
-			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode,
-				     (loff_t)mddev->array_sectors << 9);
-			mutex_unlock(&bdev->bd_inode->i_mutex);
-			bdput(bdev);
-		}
-	}
+	if (mddev->pers)
+		revalidate_disk(mddev->gendisk);
 
 	return len;
 }
@@ -4241,6 +4232,7 @@ static int do_md_run(mddev_t * mddev)
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
+	revalidate_disk(mddev->gendisk);
 	mddev->changed = 1;
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
@@ -5139,18 +5131,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 			return -ENOSPC;
 	}
 	rv = mddev->pers->resize(mddev, num_sectors);
-	if (!rv) {
-		struct block_device *bdev;
-
-		bdev = bdget_disk(mddev->gendisk, 0);
-		if (bdev) {
-			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode,
-				     (loff_t)mddev->array_sectors << 9);
-			mutex_unlock(&bdev->bd_inode->i_mutex);
-			bdput(bdev);
-		}
-	}
+	if (!rv)
+		revalidate_disk(mddev->gendisk);
 	return rv;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 67e794d0097f..8726fd7ebce5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2134,6 +2134,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
+	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->dev_sectors;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2dc35b4c20ac..2b521ee67dfa 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4858,6 +4858,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 		return -EINVAL;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
+	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5058,7 +5059,6 @@ static void end_reshape(raid5_conf_t *conf)
  */
 static void raid5_finish_reshape(mddev_t *mddev)
 {
-	struct block_device *bdev;
 	raid5_conf_t *conf = mddev->private;
 
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5067,15 +5067,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 			set_capacity(mddev->gendisk, mddev->array_sectors);
 			mddev->changed = 1;
-
-			bdev = bdget_disk(mddev->gendisk, 0);
-			if (bdev) {
-				mutex_lock(&bdev->bd_inode->i_mutex);
-				i_size_write(bdev->bd_inode,
-					     (loff_t)mddev->array_sectors << 9);
-				mutex_unlock(&bdev->bd_inode->i_mutex);
-				bdput(bdev);
-			}
+			revalidate_disk(mddev->gendisk);
 		} else {
 			int d;
 			mddev->degraded = conf->raid_disks;
-- 
cgit v1.2.2


From c8c00a6915a2e3d10416e8bdd3138429beb96210 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 10 Aug 2009 12:50:52 +1000
Subject: Remove deadlock potential in md_open

A recent commit:
  commit 449aad3e25358812c43afc60918c5ad3819488e7

introduced the possibility of an A-B/B-A deadlock between
bd_mutex and reconfig_mutex.

__blkdev_get holds bd_mutex while calling md_open which takes
   reconfig_mutex,
do_md_run is always called with reconfig_mutex held, and it now
   takes bd_mutex in the call the revalidate_disk.

This potential deadlock was not caught by lockdep due to the
use of mutex_lock_interruptible_nexted which was introduced
by
   commit d63a5a74dee87883fda6b7d170244acaac5b05e8
do avoid a warning of an impossible deadlock.

It is quite possible to split reconfig_mutex in to two locks.
One protects the array data structures while it is being
reconfigured, the other ensures that an array is never even partially
open while it is being deactivated.
In particular, the second lock prevents an open from completing
between the time when do_md_stop checks if there are any active opens,
and the time when the array is either set read-only, or when ->pers is
set to NULL.  So we can be certain that no IO is in flight as the
array is being destroyed.

So create a new lock, open_mutex, just to ensure exclusion between
'open' and 'stop'.

This avoids the deadlock and also avoids the lockdep warning mentioned
in commit d63a5a74d

Reported-by: "Mike Snitzer" <snitzer@gmail.com>
Reported-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 18 ++++++++++--------
 drivers/md/md.h | 10 ++++++++++
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5b98bea4ff9b..5614500092e3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit)
 	else
 		new->md_minor = MINOR(unit) >> MdpMinorShift;
 
+	mutex_init(&new->open_mutex);
 	mutex_init(&new->reconfig_mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
@@ -4304,12 +4305,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	struct gendisk *disk = mddev->gendisk;
 	mdk_rdev_t *rdev;
 
+	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
-		return -EBUSY;
-	}
-
-	if (mddev->pers) {
+		err = -EBUSY;
+	} else if (mddev->pers) {
 
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4367,7 +4367,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			set_disk_ro(disk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
-
+out:
+	mutex_unlock(&mddev->open_mutex);
+	if (err)
+		return err;
 	/*
 	 * Free resources if final stop
 	 */
@@ -4433,7 +4436,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	blk_integrity_unregister(disk);
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
-out:
 	return err;
 }
 
@@ -5518,12 +5520,12 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	}
 	BUG_ON(mddev != bdev->bd_disk->private_data);
 
-	if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
+	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
 		goto out;
 
 	err = 0;
 	atomic_inc(&mddev->openers);
-	mddev_unlock(mddev);
+	mutex_unlock(&mddev->open_mutex);
 
 	check_disk_change(bdev);
  out:
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 78f03168baf9..f8fc188bc762 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -223,6 +223,16 @@ struct mddev_s
 							    * so we don't loop trying */
 
 	int				in_sync;	/* know to not need resync */
+	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+	 * that we are never stopping an array while it is open.
+	 * 'reconfig_mutex' protects all other reconfiguration.
+	 * These locks are separate due to conflicting interactions
+	 * with bdev->bd_mutex.
+	 * Lock ordering is:
+	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
+	 */
+	struct mutex			open_mutex;
 	struct mutex			reconfig_mutex;
 	atomic_t			active;		/* general refcount */
 	atomic_t			openers;	/* number of active opens */
-- 
cgit v1.2.2


From 51d5668cb2e3fd1827a55184e48606fff054c5be Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 09:54:02 +1000
Subject: md: never advance 'events' counter by more than 1.

When assembling arrays, md allows two devices to have different event
counts as long as the difference is only '1'.  This is to cope with
a system failure between updating the metadata on two difference
devices.

However there are currently times when we update the event count by
2.  This was done to keep the event count even when the array is clean
and odd when it is dirty, which allows us to avoid writing common
update to spare devices and so allow those spares to go to sleep.

This is bad for the above reason.  So change it to never increase by
two.  This means that the alignment between 'odd/even' and
'clean/dirty' might take a little longer to attain, but that is only a
small cost.  The spares will get a few more updates but that will
still be spared (;-) most updates and can still go to sleep.

Prior to this patch there was a small chance that after a crash an
array would fail to assemble due to the overly large event count
mismatch.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5614500092e3..d18805fea111 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1975,17 +1975,14 @@ repeat:
 		/* otherwise we have to go forward and ... */
 		mddev->events ++;
 		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
-			/* .. if the array isn't clean, insist on an odd 'events' */
-			if ((mddev->events&1)==0) {
-				mddev->events++;
+			/* .. if the array isn't clean, an 'even' event must also go
+			 * to spares. */
+			if ((mddev->events&1)==0)
 				nospares = 0;
-			}
 		} else {
-			/* otherwise insist on an even 'events' (for clean states) */
-			if ((mddev->events&1)) {
-				mddev->events++;
+			/* otherwise an 'odd' event must go to spares */
+			if ((mddev->events&1))
 				nospares = 0;
-			}
 		}
 	}
 
-- 
cgit v1.2.2


From 67ac6011db5d2b0c853d573ff474b25c85dfb644 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:06:24 +1000
Subject: md/raid5: allow new reshape modes to be restarted in the middle.

md/raid5 doesn't allow a reshape to restart if it involves writing
over the same part of disk that it would be reading from.
This happens at the beginning of a reshape that increases the number
of devices, at the end of a reshape that decreases the number of
devices, and continuously for a reshape that does not change the
number of devices.

The current code is correct for the "increase number of devices"
case as the critical section at the start is handled by userspace
performing a backup.

It does not work for reducing the number of devices, or the
no-change case.
For 'reducing', we need to invert the test.  For no-change we cannot
really be sure things will be safe, so simply require the array
to be read-only, which is how the user-space code which carefully
starts such arrays works.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2b521ee67dfa..b8a22a2205cf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4509,7 +4509,26 @@ static int run(mddev_t *mddev)
 			   (old_disks-max_degraded));
 		/* here_old is the first stripe that we might need to read
 		 * from */
-		if (here_new >= here_old) {
+		if (mddev->delta_disks == 0) {
+			/* We cannot be sure it is safe to start an in-place
+			 * reshape.  It is only safe if user-space if monitoring
+			 * and taking constant backups.
+			 * mdadm always starts a situation like this in
+			 * readonly mode so it can take control before
+			 * allowing any writes.  So just check for that.
+			 */
+			if ((here_new * mddev->new_chunk_sectors != 
+			     here_old * mddev->chunk_sectors) ||
+			    mddev->ro == 0) {
+				printk(KERN_ERR "raid5: in-place reshape must be started"
+				       " in read-only mode - aborting\n");
+				return -EINVAL;
+			}
+		} else if (mddev->delta_disks < 0
+		    ? (here_new * mddev->new_chunk_sectors <=
+		       here_old * mddev->chunk_sectors)
+		    : (here_new * mddev->new_chunk_sectors >=
+		       here_old * mddev->chunk_sectors)) {
 			/* Reading from the same stripe as writing to - bad */
 			printk(KERN_ERR "raid5: reshape_position too early for "
 			       "auto-recovery - aborting.\n");
-- 
cgit v1.2.2


From a639755cf885e437b2fe4168d35157fa90d530ab Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:13:00 +1000
Subject: md/raid5: make sure a reshape restarts at the correct address.

This "if" don't allow for the possibility that the number of devices
doesn't change, and so sector_nr isn't set correctly in that case.
So change '>' to '>='.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b8a22a2205cf..94a74cb5cccb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
 			sector_nr = raid5_size(mddev, 0, 0)
 				- conf->reshape_progress;
-		} else if (mddev->delta_disks > 0 &&
+		} else if (mddev->delta_disks >= 0 &&
 			   conf->reshape_progress > 0)
 			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
-- 
cgit v1.2.2


From 1a67dde0abba36421a1257d01ba9de2f6d1c160a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:41:49 +1000
Subject: md/raid5: Properly remove excess drives after shrinking a raid5/6

We were removing the drives, from the array, but not
removing symlinks from /sys/.... and not marking the device
as having been removed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 94a74cb5cccb..b8a2c5dc67ba 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5097,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev)
 					mddev->degraded--;
 			for (d = conf->raid_disks ;
 			     d < conf->raid_disks - mddev->delta_disks;
-			     d++)
-				raid5_remove_disk(mddev, d);
+			     d++) {
+				mdk_rdev_t *rdev = conf->disks[d].rdev;
+				if (rdev && raid5_remove_disk(mddev, d) == 0) {
+					char nm[20];
+					sprintf(nm, "rd%d", rdev->raid_disk);
+					sysfs_remove_link(&mddev->kobj, nm);
+					rdev->raid_disk = -1;
+				}
+			}
 		}
 		mddev->layout = conf->algorithm;
 		mddev->chunk_sectors = conf->chunk_sectors;
-- 
cgit v1.2.2


From 4d484a4a7a5126410eed5f8dd329a33f6eeed068 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 13 Aug 2009 10:41:50 +1000
Subject: md: allow upper limit for resync/reshape to be set when array is
 read-only

Normally we only allow the upper limit for a reshape to be decreased
when the array not performing a sync/recovery/reshape, otherwise there
could be races.  But if an array is part-way through a reshape when it
is assembled the reshape is started immediately leaving no window
to set an upper bound.

If the array is started read-only, the reshape will be suspended until
the array becomes writable, so that provides a window during which it
is perfectly safe to reduce the upper limit of a reshape.

So: allow the upper limit (sync_max) to be reduced even if the reshape
thread is running, as long as the array is still read-only.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d18805fea111..103f2d33fa89 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3599,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 		if (max < mddev->resync_min)
 			return -EINVAL;
 		if (max < mddev->resync_max &&
+		    mddev->ro == 0 &&
 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 			return -EBUSY;
 
-- 
cgit v1.2.2


From 80ffb3cceaefa405f2ecd46d66500ed8d53efe74 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 18 Aug 2009 10:35:26 +1000
Subject: Fix new incorrect error return from do_md_stop.

Recent commit c8c00a6915a2e3d10416e8bdd3138429beb96210
changed the exit paths in do_md_stop and was not quite
careful enough.  There is one path were 'err' now needs
to be cleared but it isn't.
So setting an array to readonly (with mdadm --readonly) will
work, but will incorrectly report and error: ENXIO.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 103f2d33fa89..9dd872000cec 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4364,6 +4364,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		if (mode == 1)
 			set_disk_ro(disk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		err = 0;
 	}
 out:
 	mutex_unlock(&mddev->open_mutex);
-- 
cgit v1.2.2