Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm.c     |   8
-rw-r--r--  drivers/md/md.c     |  27
-rw-r--r--  drivers/md/raid1.c  |   2
-rw-r--r--  drivers/md/raid10.c | 133
-rw-r--r--  drivers/md/raid5.c  |  79
5 files changed, 141 insertions(+), 108 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 02db9183ca01..77e6eff41cae 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -740,8 +740,14 @@ static void rq_completed(struct mapped_device *md, int rw, int run_queue)
 	if (!md_in_flight(md))
 		wake_up(&md->wait);
 
+	/*
+	 * Run this off this callpath, as drivers could invoke end_io while
+	 * inside their request_fn (and holding the queue lock). Calling
+	 * back into ->request_fn() could deadlock attempting to grab the
+	 * queue lock again.
+	 */
 	if (run_queue)
-		blk_run_queue(md->queue);
+		blk_run_queue_async(md->queue);
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above
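
The hunk above swaps the synchronous blk_run_queue() for blk_run_queue_async() because rq_completed() can be reached from a driver's request_fn with the queue lock already held; re-entering the dispatch path would try to take that non-recursive lock again. A minimal userspace sketch of the self-deadlock shape and the asynchronous escape — a pthread mutex stands in for the queue lock, and every name here is illustrative rather than block-layer API:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static bool run_requested;

/* Dispatch loop: always entered with queue_lock held, like a request_fn. */
static void request_fn_locked(void)
{
	printf("dispatching queued requests\n");
}

/* Synchronous kick: takes queue_lock, so it must never be called from a
 * context that already holds it. */
static void run_queue(void)
{
	pthread_mutex_lock(&queue_lock);
	request_fn_locked();
	pthread_mutex_unlock(&queue_lock);
}

/* Asynchronous kick: only records that a run is needed; some other
 * context performs it later, lock-free on this path. */
static void run_queue_async(void)
{
	run_requested = true;
}

/* Completion handler invoked by the driver from inside its request_fn,
 * i.e. with queue_lock already held: calling run_queue() here would
 * self-deadlock, run_queue_async() is safe. */
static void end_io(void)
{
	run_queue_async();
}

int main(void)
{
	pthread_mutex_lock(&queue_lock);	/* driver is inside request_fn */
	end_io();				/* completion arrives */
	pthread_mutex_unlock(&queue_lock);

	if (run_requested)			/* deferred kick, other context */
		run_queue();
	return 0;
}
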
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9ab768acfb62..61200717687b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1817,10 +1817,10 @@ retry:
 	memset(bbp, 0xff, PAGE_SIZE);
 
 	for (i = 0 ; i < bb->count ; i++) {
-		u64 internal_bb = *p++;
+		u64 internal_bb = p[i];
 		u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
 				| BB_LEN(internal_bb));
-		*bbp++ = cpu_to_le64(store_bb);
+		bbp[i] = cpu_to_le64(store_bb);
 	}
 	bb->changed = 0;
 	if (read_seqretry(&bb->lock, seq))
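
For context on the loop above: it serializes the in-memory bad-block table into the v1 superblock, and replacing pointer post-increment with indexed access keeps the pass idempotent if the enclosing read_seqretry() check forces the whole loop to rerun. A standalone sketch of the record conversion; the masks and shifts follow the BB_* macros in this era's md.h, but treat the exact constants as illustrative:

#include <stdint.h>
#include <stdio.h>

#define BB_LEN_MASK	(0x00000000000001FFULL)	/* low 9 bits: length - 1 */
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)	/* 54 bits: start sector */

#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)

int main(void)
{
	/* 8 bad sectors starting at sector 12345, in the in-memory format */
	uint64_t internal_bb = (12345ULL << 9) | (8 - 1);

	/* On-disk format written by the hunk above: offset shifted by 10,
	 * length in the low 10 bits */
	uint64_t store_bb = (BB_OFFSET(internal_bb) << 10)
			  | BB_LEN(internal_bb);

	printf("sector %llu len %llu -> 0x%016llx\n",
	       (unsigned long long)BB_OFFSET(internal_bb),
	       (unsigned long long)BB_LEN(internal_bb),
	       (unsigned long long)store_bb);
	return 0;
}
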
@@ -5294,7 +5294,7 @@ void md_stop_writes(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(md_stop_writes);
 
-void md_stop(struct mddev *mddev)
+static void __md_stop(struct mddev *mddev)
 {
 	mddev->ready = 0;
 	mddev->pers->stop(mddev);
@@ -5304,6 +5304,18 @@ void md_stop(struct mddev *mddev)
 	mddev->pers = NULL;
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 }
+
+void md_stop(struct mddev *mddev)
+{
+	/* stop the array and free any attached data structures.
+	 * This is called from dm-raid
+	 */
+	__md_stop(mddev);
+	bitmap_destroy(mddev);
+	if (mddev->bio_set)
+		bioset_free(mddev->bio_set);
+}
+
 EXPORT_SYMBOL_GPL(md_stop);
 
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
@@ -5364,7 +5376,7 @@ static int do_md_stop(struct mddev * mddev, int mode,
 		set_disk_ro(disk, 0);
 
 	__md_stop_writes(mddev);
-	md_stop(mddev);
+	__md_stop(mddev);
 	mddev->queue->merge_bvec_fn = NULL;
 	mddev->queue->backing_dev_info.congested_fn = NULL;
 
@@ -7936,9 +7948,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 		   sector_t *first_bad, int *bad_sectors)
 {
 	int hi;
-	int lo = 0;
+	int lo;
 	u64 *p = bb->page;
-	int rv = 0;
+	int rv;
 	sector_t target = s + sectors;
 	unsigned seq;
 
@@ -7953,7 +7965,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 
 retry:
 	seq = read_seqbegin(&bb->lock);
-
+	lo = 0;
+	rv = 0;
 	hi = bb->count;
 
 	/* Binary search between lo and hi for 'target'
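
The md_is_badblock() fix above moves the initialization of lo and rv inside the retry loop: when read_seqretry() detects a concurrent writer, the search restarts at the retry label, so every value the previous pass mutated has to be re-established or the second pass begins from a stale midpoint. A hedged standalone sketch of the same binary-search-under-seqlock shape; seq_begin()/seq_retry() are simplified stand-ins for read_seqbegin()/read_seqretry():

#include <stdio.h>

static unsigned sequence;	/* even = stable, odd = writer active */

static unsigned seq_begin(void)	 { return sequence; }
static int seq_retry(unsigned s) { return (s & 1) || s != sequence; }

static int table[] = { 3, 9, 14, 27 };	/* sorted, like bb->page entries */

/* Find the largest slot whose key is <= target, retrying if a writer
 * intervened. Everything the loop mutates is reset after 'retry'. */
static int search(int target)
{
	unsigned seq;
	int lo, hi, rv;

retry:
	seq = seq_begin();
	lo = 0;		/* must be reset on every pass... */
	rv = -1;	/* ...and so must the result */
	hi = sizeof(table) / sizeof(table[0]);

	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		if (table[mid] <= target)
			lo = mid;
		else
			hi = mid;
	}
	if (table[lo] <= target)
		rv = lo;

	if (seq_retry(seq))
		goto retry;
	return rv;
}

int main(void)
{
	printf("slot %d\n", search(14));	/* prints "slot 2" */
	return 0;
}
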
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 636bae0405e8..a0f73092176e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -963,7 +963,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	struct r1conf *conf = mddev->private;
 	struct bio *bio;
 
-	if (from_schedule) {
+	if (from_schedule || current->bio_list) {
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
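
This raid1 change (mirrored in raid10 below) defers the plugged bios to the md thread not only when unplugging from schedule, but whenever current->bio_list is non-NULL — that is, whenever the callback runs while this task is still inside generic_make_request, where submitting the bios directly cannot make forward progress. A small userspace sketch of the defer-instead-of-recurse pattern; the thread-local flag stands in for current->bio_list and the names are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define MAX_PENDING 16

static __thread bool in_submit;		/* ~ current->bio_list != NULL */
static __thread int pending[MAX_PENDING];
static __thread int npending;

static void do_io(int item);

/* If we are already inside the submission path, queue the item for the
 * outermost caller instead of recursing into the same machinery. */
static void submit(int item)
{
	if (in_submit) {
		pending[npending++] = item;
		return;
	}
	in_submit = true;
	do_io(item);
	while (npending)		/* drain work queued by nested calls */
		do_io(pending[--npending]);
	in_submit = false;
}

static void do_io(int item)
{
	printf("doing I/O for item %d\n", item);
	if (item > 0)
		submit(item - 1);	/* e.g. a mirrored sub-request */
}

int main(void)
{
	submit(3);	/* items run 3,2,1,0 with bounded stack depth */
	return 0;
}
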
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d1295aff4173..c9acbd717131 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	 */
 	one_write_done(r10_bio);
 	if (dec_rdev)
-		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 }
 
 /*
@@ -1069,7 +1069,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	struct r10conf *conf = mddev->private;
 	struct bio *bio;
 
-	if (from_schedule) {
+	if (from_schedule || current->bio_list) {
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
@@ -1334,18 +1334,21 @@ retry_write:
 			blocked_rdev = rrdev;
 			break;
 		}
+		if (rdev && (test_bit(Faulty, &rdev->flags)
+			     || test_bit(Unmerged, &rdev->flags)))
+			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)
 			      || test_bit(Unmerged, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
 		r10_bio->devs[i].repl_bio = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags) ||
-		    test_bit(Unmerged, &rdev->flags)) {
+
+		if (!rdev && !rrdev) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
 		}
-		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
 			sector_t first_bad;
 			sector_t dev_sector = r10_bio->devs[i].addr;
 			int bad_sectors;
@@ -1387,8 +1390,10 @@ retry_write:
 				max_sectors = good_sectors;
 			}
 		}
-		r10_bio->devs[i].bio = bio;
-		atomic_inc(&rdev->nr_pending);
+		if (rdev) {
+			r10_bio->devs[i].bio = bio;
+			atomic_inc(&rdev->nr_pending);
+		}
 		if (rrdev) {
 			r10_bio->devs[i].repl_bio = bio;
 			atomic_inc(&rrdev->nr_pending);
@@ -1444,69 +1449,71 @@ retry_write:
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
-		if (!r10_bio->devs[i].bio)
-			continue;
+		if (r10_bio->devs[i].bio) {
+			struct md_rdev *rdev = conf->mirrors[d].rdev;
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+				    max_sectors);
+			r10_bio->devs[i].bio = mbio;
+
+			mbio->bi_sector	= (r10_bio->devs[i].addr+
+					   choose_data_offset(r10_bio,
+							      rdev));
+			mbio->bi_bdev = rdev->bdev;
+			mbio->bi_end_io	= raid10_end_write_request;
+			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_private = r10_bio;
 
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
-		r10_bio->devs[i].bio = mbio;
+			atomic_inc(&r10_bio->remaining);
 
-		mbio->bi_sector	= (r10_bio->devs[i].addr+
-				   choose_data_offset(r10_bio,
-						      conf->mirrors[d].rdev));
-		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
-		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
-		mbio->bi_private = r10_bio;
+			cb = blk_check_plugged(raid10_unplug, mddev,
+					       sizeof(*plug));
+			if (cb)
+				plug = container_of(cb, struct raid10_plug_cb,
+						    cb);
+			else
+				plug = NULL;
+			spin_lock_irqsave(&conf->device_lock, flags);
+			if (plug) {
+				bio_list_add(&plug->pending, mbio);
+				plug->pending_cnt++;
+			} else {
+				bio_list_add(&conf->pending_bio_list, mbio);
+				conf->pending_count++;
+			}
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			if (!plug)
+				md_wakeup_thread(mddev->thread);
+		}
 
-		atomic_inc(&r10_bio->remaining);
+		if (r10_bio->devs[i].repl_bio) {
+			struct md_rdev *rdev = conf->mirrors[d].replacement;
+			if (rdev == NULL) {
+				/* Replacement just got moved to main 'rdev' */
+				smp_mb();
+				rdev = conf->mirrors[d].rdev;
+			}
+			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+				    max_sectors);
+			r10_bio->devs[i].repl_bio = mbio;
+
+			mbio->bi_sector	= (r10_bio->devs[i].addr +
+					   choose_data_offset(
+						r10_bio, rdev));
+			mbio->bi_bdev = rdev->bdev;
+			mbio->bi_end_io	= raid10_end_write_request;
+			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_private = r10_bio;
 
-		cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
-		if (cb)
-			plug = container_of(cb, struct raid10_plug_cb, cb);
-		else
-			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		if (plug) {
-			bio_list_add(&plug->pending, mbio);
-			plug->pending_cnt++;
-		} else {
+			atomic_inc(&r10_bio->remaining);
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			if (!mddev_check_plugged(mddev))
+				md_wakeup_thread(mddev->thread);
 		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
-			md_wakeup_thread(mddev->thread);
-
-		if (!r10_bio->devs[i].repl_bio)
-			continue;
-
-		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-			    max_sectors);
-		r10_bio->devs[i].repl_bio = mbio;
-
-		/* We are actively writing to the original device
-		 * so it cannot disappear, so the replacement cannot
-		 * become NULL here
-		 */
-		mbio->bi_sector	= (r10_bio->devs[i].addr +
-				   choose_data_offset(
-					r10_bio,
-					conf->mirrors[d].replacement));
-		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
-		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
-		mbio->bi_private = r10_bio;
-
-		atomic_inc(&r10_bio->remaining);
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!mddev_check_plugged(mddev))
-			md_wakeup_thread(mddev->thread);
 	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c5439dce0295..a4502686e7a8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2774,10 +2774,12 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		dev = &sh->dev[i];
 		if (!test_bit(R5_LOCKED, &dev->flags) &&
 		    (test_bit(R5_UPTODATE, &dev->flags) ||
-		     test_and_clear_bit(R5_Discard, &dev->flags))) {
+		     test_bit(R5_Discard, &dev->flags))) {
 			/* We can return any write requests */
 			struct bio *wbi, *wbi2;
 			pr_debug("Return write for disc %d\n", i);
+			if (test_and_clear_bit(R5_Discard, &dev->flags))
+				clear_bit(R5_UPTODATE, &dev->flags);
 			wbi = dev->written;
 			dev->written = NULL;
 			while (wbi && wbi->bi_sector <
@@ -2795,7 +2797,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 					 0);
 			}
-		}
+		} else if (test_bit(R5_Discard, &sh->dev[i].flags))
+			clear_bit(R5_Discard, &sh->dev[i].flags);
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -3490,40 +3493,6 @@ static void handle_stripe(struct stripe_head *sh)
 		handle_failed_sync(conf, sh, &s);
 	}
 
-	/*
-	 * might be able to return some write requests if the parity blocks
-	 * are safe, or on a failed drive
-	 */
-	pdev = &sh->dev[sh->pd_idx];
-	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
-	qdev = &sh->dev[sh->qd_idx];
-	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
-		|| conf->level < 6;
-
-	if (s.written &&
-	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
-			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
-				 test_bit(R5_Discard, &pdev->flags))))) &&
-	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
-			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
-				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
-
-	/* Now we might consider reading some blocks, either to check/generate
-	 * parity, or to satisfy requests
-	 * or to load a block that is being partially written.
-	 */
-	if (s.to_read || s.non_overwrite
-	    || (conf->level == 6 && s.to_write && s.failed)
-	    || (s.syncing && (s.uptodate + s.compute < disks))
-	    || s.replacing
-	    || s.expanding)
-		handle_stripe_fill(sh, &s, disks);
-
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
@@ -3561,6 +3530,40 @@ static void handle_stripe(struct stripe_head *sh)
 			s.dec_preread_active = 1;
 	}
 
+	/*
+	 * might be able to return some write requests if the parity blocks
+	 * are safe, or on a failed drive
+	 */
+	pdev = &sh->dev[sh->pd_idx];
+	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+	qdev = &sh->dev[sh->qd_idx];
+	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+		|| conf->level < 6;
+
+	if (s.written &&
+	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+			     && !test_bit(R5_LOCKED, &pdev->flags)
+			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
+				 test_bit(R5_Discard, &pdev->flags))))) &&
+	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && !test_bit(R5_LOCKED, &qdev->flags)
+			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
+				 test_bit(R5_Discard, &qdev->flags))))))
+		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+	/* Now we might consider reading some blocks, either to check/generate
+	 * parity, or to satisfy requests
+	 * or to load a block that is being partially written.
+	 */
+	if (s.to_read || s.non_overwrite
+	    || (conf->level == 6 && s.to_write && s.failed)
+	    || (s.syncing && (s.uptodate + s.compute < disks))
+	    || s.replacing
+	    || s.expanding)
+		handle_stripe_fill(sh, &s, disks);
+
 	/* Now to consider new write requests and what else, if anything
 	 * should be read. We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
@@ -5529,6 +5532,10 @@ static int run(struct mddev *mddev)
 		 * discard data disk but write parity disk
 		 */
 		stripe = stripe * PAGE_SIZE;
+		/* Round up to power of 2, as discard handling
+		 * currently assumes that */
+		while ((stripe-1) & stripe)
+			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
 		/*
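
The loop added to run() above rounds the stripe size up to the next power of two: (stripe-1) & stripe is non-zero exactly while stripe is not a power of two, and stripe | (stripe-1) sets every bit below the highest set bit so that adding one carries into the next power of two. A standalone check of the bit trick (the 1536 KiB input imagines, say, three data disks with 512 KiB chunks):

#include <stdio.h>

static unsigned long roundup_pow2(unsigned long stripe)
{
	/* (stripe - 1) & stripe == 0 exactly when stripe is a power of 2 */
	while ((stripe - 1) & stripe)
		/* fill every bit below the top set bit, then carry up */
		stripe = (stripe | (stripe - 1)) + 1;
	return stripe;
}

int main(void)
{
	/* 1536 KiB -> 2048 KiB (0x180000 -> 0x200000) */
	printf("%lu\n", roundup_pow2(1536UL * 1024));
	return 0;
}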