author	Tomi Valkeinen <tomi.valkeinen@ti.com>	2012-09-03 02:26:33 -0400
committer	Tomi Valkeinen <tomi.valkeinen@ti.com>	2012-09-03 02:26:33 -0400
commit	c50e86ce7c2961a41f2f7aa6e4fd6c99229ba205 (patch)
tree	4ea36009719bd8fc523239fe1bdccb90f0dce3ae /drivers/md/raid1.c
parent	14d33d384693eb6083396199de516fdef320f7af (diff)
parent	4cbe5a555fa58a79b6ecbb6c531b8bab0650778d (diff)
Merge tag 'v3.6-rc4'
Merge 3.6-rc4 to get latest OMAP and device tree fixes.
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--	drivers/md/raid1.c	244
1 file changed, 188 insertions, 56 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a9c7981ddd24..611b5f797618 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
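The two macros introduced above are sentinels, not real bios: the integers 1 and 2 are cast to struct bio * so that a slot in an r1bio's bios[] array can record "reads were redirected away from this device" or "a write over a known bad block succeeded, clean up later" without allocating anything, and BIO_SPECIAL() lets completion code tell the markers apart from pointers it may actually dereference. A minimal userspace sketch of the same sentinel-pointer idiom follows; struct fake_bio, the slot array and the messages are illustrative stand-ins, not kernel types.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct bio; only pointer identity matters. */
struct fake_bio { int id; };

/* Sentinel values: small integers cast to pointers, never dereferenced. */
#define IO_BLOCKED   ((struct fake_bio *)1)
#define IO_MADE_GOOD ((struct fake_bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void complete_slot(struct fake_bio *slot, int dev)
{
	if (slot == NULL) {
		printf("dev %d: no request\n", dev);
	} else if (slot == IO_BLOCKED) {
		printf("dev %d: read redirected, device left untouched\n", dev);
	} else if (slot == IO_MADE_GOOD) {
		printf("dev %d: write over bad block succeeded, clear record later\n", dev);
	} else {
		/* Only a real pointer is safe to dereference. */
		printf("dev %d: real bio %d completed\n", dev, slot->id);
		free(slot);
	}
}

int main(void)
{
	struct fake_bio *slots[3];
	struct fake_bio *real = malloc(sizeof(*real));

	real->id = 42;
	slots[0] = real;          /* normal I/O */
	slots[1] = IO_BLOCKED;    /* bad block: skip this mirror */
	slots[2] = IO_MADE_GOOD;  /* bad block overwritten successfully */

	for (int dev = 0; dev < 3; dev++) {
		if (slots[dev] && !BIO_SPECIAL(slots[dev]))
			printf("dev %d holds a real bio\n", dev);
		complete_slot(slots[dev], dev);
	}
	return 0;
}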
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
 	int best_good_sectors;
-	int start_disk;
-	int best_disk;
-	int i;
+	int best_disk, best_dist_disk, best_pending_disk;
+	int has_nonrot_disk;
+	int disk;
 	sector_t best_dist;
+	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
+	int choose_next_idle;
 
 	rcu_read_lock();
 	/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 retry:
 	sectors = r1_bio->sectors;
 	best_disk = -1;
+	best_dist_disk = -1;
 	best_dist = MaxSector;
+	best_pending_disk = -1;
+	min_pending = UINT_MAX;
 	best_good_sectors = 0;
+	has_nonrot_disk = 0;
+	choose_next_idle = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
-	    (this_sector + sectors >= conf->next_resync)) {
+	    (this_sector + sectors >= conf->next_resync))
 		choose_first = 1;
-		start_disk = 0;
-	} else {
+	else
 		choose_first = 0;
-		start_disk = conf->last_used;
-	}
 
-	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
-
-		int disk = start_disk + i;
-		if (disk >= conf->raid_disks)
-			disk -= conf->raid_disks;
+		unsigned int pending;
+		bool nonrot;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
-		if (choose_first
-		    /* Don't change to another disk for sequential reads */
-		    || conf->next_seq_sect == this_sector
-		    || dist == 0
-		    /* If device is idle, use it */
-		    || atomic_read(&rdev->nr_pending) == 0) {
+		if (choose_first) {
 			best_disk = disk;
 			break;
 		}
+		/* Don't change to another disk for sequential reads */
+		if (conf->mirrors[disk].next_seq_sect == this_sector
+		    || dist == 0) {
+			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+			struct raid1_info *mirror = &conf->mirrors[disk];
+
+			best_disk = disk;
+			/*
+			 * If buffered sequential IO size exceeds optimal
+			 * iosize, check if there is idle disk. If yes, choose
+			 * the idle disk. read_balance could already choose an
+			 * idle disk before noticing it's a sequential IO in
+			 * this disk. This doesn't matter because this disk
+			 * will idle, next time it will be utilized after the
+			 * first disk has IO size exceeds optimal iosize. In
+			 * this way, iosize of the first disk will be optimal
+			 * iosize at least. iosize of the second disk might be
+			 * small, but not a big deal since when the second disk
+			 * starts IO, the first disk is likely still busy.
+			 */
+			if (nonrot && opt_iosize > 0 &&
+			    mirror->seq_start != MaxSector &&
+			    mirror->next_seq_sect > opt_iosize &&
+			    mirror->next_seq_sect - opt_iosize >=
+			    mirror->seq_start) {
+				choose_next_idle = 1;
+				continue;
+			}
+			break;
+		}
+		/* If device is idle, use it */
+		if (pending == 0) {
+			best_disk = disk;
+			break;
+		}
+
+		if (choose_next_idle)
+			continue;
+
+		if (min_pending > pending) {
+			min_pending = pending;
+			best_pending_disk = disk;
+		}
+
 		if (dist < best_dist) {
 			best_dist = dist;
-			best_disk = disk;
+			best_dist_disk = disk;
 		}
 	}
 
+	/*
+	 * If all disks are rotational, choose the closest disk. If any disk is
+	 * non-rotational, choose the disk with less pending request even the
+	 * disk is rotational, which might/might not be optimal for raids with
+	 * mixed ratation/non-rotational disks depending on workload.
+	 */
+	if (best_disk == -1) {
+		if (has_nonrot_disk)
+			best_disk = best_pending_disk;
+		else
+			best_disk = best_dist_disk;
+	}
+
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			goto retry;
 		}
 		sectors = best_good_sectors;
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = best_disk;
+
+		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+			conf->mirrors[best_disk].seq_start = this_sector;
+
+		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
 	}
 	rcu_read_unlock();
 	*max_sectors = sectors;
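Taken together, the read_balance() hunks above drop the old round-robin starting point (conf->last_used) and instead track two fallback candidates on every pass: the disk whose head sits closest to the request (the old heuristic, good for rotating media) and the disk with the fewest pending requests (better for SSDs), while sequential reads stay on their current disk and any idle disk is taken immediately. A minimal sketch of that selection policy, using a simplified mirror array and omitting the RCU, bad-block and optimal-iosize handling, might look like this:

#include <stdbool.h>
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>

/* Simplified stand-in for one mirror; field names are illustrative only. */
struct mirror {
	long long head_position; /* last known head position (sectors) */
	unsigned int nr_pending; /* in-flight requests */
	bool nonrot;             /* non-rotational (SSD)? */
};

/* Pick a mirror for a read at 'sector', mimicking the two-candidate policy:
 * remember the closest disk and the least-loaded disk, then choose between
 * them depending on whether any SSD is present in the array.
 */
static int pick_mirror(struct mirror *m, int n, long long sector)
{
	int best_dist_disk = -1, best_pending_disk = -1;
	long long best_dist = LLONG_MAX;
	unsigned int min_pending = UINT_MAX;
	bool has_nonrot = false;

	for (int disk = 0; disk < n; disk++) {
		long long dist = llabs(sector - m[disk].head_position);

		has_nonrot |= m[disk].nonrot;

		if (m[disk].nr_pending == 0)
			return disk;		/* idle disk: use it at once */

		if (m[disk].nr_pending < min_pending) {
			min_pending = m[disk].nr_pending;
			best_pending_disk = disk;
		}
		if (dist < best_dist) {
			best_dist = dist;
			best_dist_disk = disk;
		}
	}
	/* No idle disk: SSDs care about queue depth, spinning disks about seeks. */
	return has_nonrot ? best_pending_disk : best_dist_disk;
}

int main(void)
{
	struct mirror m[2] = {
		{ .head_position = 0,    .nr_pending = 3, .nonrot = true },
		{ .head_position = 5000, .nr_pending = 1, .nonrot = false },
	};
	printf("read at sector 10 -> mirror %d\n", pick_mirror(m, 2, 10));
	return 0;
}

With the sample array in main(), no disk is idle and one member is non-rotational, so the least-loaded mirror wins over the closer one.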
@@ -870,10 +944,48 @@ do_sync_io:
 	pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
+struct raid1_plug_cb {
+	struct blk_plug_cb	cb;
+	struct bio_list		pending;
+	int			pending_cnt;
+};
+
+static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
+						  cb);
+	struct mddev *mddev = plug->cb.data;
+	struct r1conf *conf = mddev->private;
+	struct bio *bio;
+
+	if (from_schedule) {
+		spin_lock_irq(&conf->device_lock);
+		bio_list_merge(&conf->pending_bio_list, &plug->pending);
+		conf->pending_count += plug->pending_cnt;
+		spin_unlock_irq(&conf->device_lock);
+		md_wakeup_thread(mddev->thread);
+		kfree(plug);
+		return;
+	}
+
+	/* we aren't scheduling, so we can do the write-out directly. */
+	bio = bio_list_get(&plug->pending);
+	bitmap_unplug(mddev->bitmap);
+	wake_up(&conf->wait_barrier);
+
+	while (bio) { /* submit pending writes */
+		struct bio *next = bio->bi_next;
+		bio->bi_next = NULL;
+		generic_make_request(bio);
+		bio = next;
+	}
+	kfree(plug);
+}
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
-	struct mirror_info *mirror;
+	struct raid1_info *mirror;
 	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	int i, disks;
@@ -883,7 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	struct md_rdev *blocked_rdev;
-	int plugged;
+	struct blk_plug_cb *cb;
+	struct raid1_plug_cb *plug = NULL;
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
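raid1_unplug() above is the callback attached to a block plug: while the submitting task keeps its plug open, write bios collect on the plug's private bio_list instead of the shared conf->pending_bio_list, and the whole batch is flushed when the plug is released — submitted directly if the task unplugged itself, or handed to the raid1 thread when the unplug comes from the scheduler and sleeping work must be avoided. A standalone sketch of this "batch per caller, flush on unplug" pattern, with a plain linked list standing in for the block-layer plug machinery, could look as follows:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Minimal stand-in for a request; the real code batches struct bio. */
struct req {
	int id;
	struct req *next;
};

/* Per-caller "plug": requests parked here while the caller keeps submitting. */
struct plug {
	struct req *pending;
	int pending_cnt;
};

/* Shared queue normally drained by a background thread. */
static struct req *thread_queue;

static void plug_add(struct plug *p, struct req *r)
{
	r->next = p->pending;
	p->pending = r;
	p->pending_cnt++;
}

/* On unplug: if we are called from the scheduler we must not do the
 * (possibly blocking) write-out ourselves, so hand the batch to the
 * worker; otherwise submit the whole batch directly.
 */
static void unplug(struct plug *p, bool from_schedule)
{
	if (from_schedule) {
		while (p->pending) {
			struct req *r = p->pending;
			p->pending = r->next;
			r->next = thread_queue;
			thread_queue = r;
		}
		printf("deferred %d requests to the worker thread\n", p->pending_cnt);
		p->pending_cnt = 0;
		return;
	}
	while (p->pending) {
		struct req *r = p->pending;
		p->pending = r->next;
		printf("submitting request %d directly\n", r->id);
		free(r);
	}
	p->pending_cnt = 0;
}

int main(void)
{
	struct plug p = { NULL, 0 };

	for (int i = 0; i < 3; i++) {
		struct req *r = malloc(sizeof(*r));
		r->id = i;
		plug_add(&p, r);	/* batched, nothing submitted yet */
	}
	unplug(&p, false);		/* caller unplugs: batch goes out in one go */
	return 0;
}

As in the hunk, writes from one submitter are kept together and go out back to back at unplug time instead of being woken through the shared queue one at a time.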
@@ -1034,7 +1147,6 @@ read_again:
 	 * the bad blocks. Each set of writes gets it's own r1bio
 	 * with a set of bios attached.
 	 */
-	plugged = mddev_check_plugged(mddev);
 
 	disks = conf->raid_disks * 2;
  retry_write:
@@ -1187,10 +1299,23 @@ read_again:
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
+
+		cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
+		if (cb)
+			plug = container_of(cb, struct raid1_plug_cb, cb);
+		else
+			plug = NULL;
 		spin_lock_irqsave(&conf->device_lock, flags);
-		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
+		if (plug) {
+			bio_list_add(&plug->pending, mbio);
+			plug->pending_cnt++;
+		} else {
+			bio_list_add(&conf->pending_bio_list, mbio);
+			conf->pending_count++;
+		}
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		if (!plug)
+			md_wakeup_thread(mddev->thread);
 	}
 	/* Mustn't call r1_bio_write_done before this next test,
 	 * as it could result in the bio being freed.
@@ -1213,9 +1338,6 @@ read_again:
 
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
-
-	if (do_sync || !bitmap || !plugged)
-		md_wakeup_thread(mddev->thread);
 }
 
 static void status(struct seq_file *seq, struct mddev *mddev)
@@ -1367,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror = 0;
-	struct mirror_info *p;
+	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1436,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
-	struct mirror_info *p = conf->mirrors+ number;
+	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)
 		p = conf->mirrors + conf->raid_disks + number;
@@ -1821,8 +1943,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
 		/* if we're here, all write(s) have completed, so clean up */
-		md_done_sync(mddev, r1_bio->sectors, 1);
-		put_buf(r1_bio);
+		int s = r1_bio->sectors;
+		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+		    test_bit(R1BIO_WriteError, &r1_bio->state))
+			reschedule_retry(r1_bio);
+		else {
+			put_buf(r1_bio);
+			md_done_sync(mddev, s, 1);
+		}
 	}
 }
 
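The sync_request_write() change above keeps the final accounting out of the completion path whenever bad-block bookkeeping is pending: if the r1bio carries R1BIO_MadeGood or R1BIO_WriteError it is pushed back to the raid1 thread with reschedule_retry() (the bad-block list update must run in process context, as the IO_MADE_GOOD comment near the top of the file notes), and only the clean case finishes inline. A rough sketch of that fast-path/slow-path split, with a plain struct standing in for the r1bio state bits, might be:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative completion record; the real code uses struct r1bio with
 * R1BIO_MadeGood / R1BIO_WriteError bits in r1_bio->state.
 */
struct sync_done {
	int sectors;
	bool made_good;   /* wrote over a known bad block: must update records */
	bool write_error; /* a write failed: must record new bad blocks */
};

/* Pretend worker hand-off: work that may sleep goes here instead of
 * being done in the (interrupt-context) completion path.
 */
static void reschedule_to_worker(struct sync_done *d)
{
	printf("deferring %d sectors to worker for bad-block bookkeeping\n",
	       d->sectors);
}

static void sync_write_done(struct sync_done *d)
{
	if (d->made_good || d->write_error) {
		/* Updating the bad-block list can sleep, so hand off. */
		reschedule_to_worker(d);
		return;
	}
	/* Fast path: nothing to record, account the sectors right away. */
	printf("resync of %d sectors complete\n", d->sectors);
}

int main(void)
{
	struct sync_done clean = { .sectors = 128 };
	struct sync_done fixed = { .sectors = 64, .made_good = true };

	sync_write_done(&clean);
	sync_write_done(&fixed);
	return 0;
}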
@@ -2170,8 +2298,7 @@ static void raid1d(struct mddev *mddev)
 	blk_start_plug(&plug);
 	for (;;) {
 
-		if (atomic_read(&mddev->plug_cnt) == 0)
-			flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -2368,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 				bio->bi_rw = READ;
 				bio->bi_end_io = end_sync_read;
 				read_targets++;
+			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+				/*
+				 * The device is suitable for reading (InSync),
+				 * but has bad block(s) here. Let's try to correct them,
+				 * if we are doing resync or repair. Otherwise, leave
+				 * this device alone for this sync request.
+				 */
+				bio->bi_rw = WRITE;
+				bio->bi_end_io = end_sync_write;
+				write_targets++;
 			}
 		}
 		if (bio->bi_end_io) {
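The new else-if branch above changes what a resync or repair pass does with an in-sync device that has bad blocks in the current window: instead of being skipped, it becomes a write target so the bad region is rewritten from a good copy, while a check pass (MD_RECOVERY_CHECK) still leaves it alone. A simplified sketch of how one sync window might classify devices under that rule — illustrative flags only, not the kernel's rdev/MD_RECOVERY_* state — is:

#include <stdio.h>
#include <stdbool.h>

enum role { SKIP, READ_SRC, WRITE_TGT };

/* Simplified per-device state for one resync window; names are illustrative. */
struct dev {
	bool in_sync;        /* device holds valid data for this array */
	bool bad_block_here; /* known bad block inside the current window */
	bool write_error_seen;
};

/* Classify one device for a resync window, mirroring the new rule:
 * an in-sync device with a bad block in the window becomes a write
 * target during resync/repair, so the bad region gets rewritten.
 */
static enum role classify(const struct dev *d, bool have_read_src,
			  bool doing_repair)
{
	if (!d->in_sync)
		return WRITE_TGT;               /* being recovered: always write */
	if (!d->bad_block_here)
		return have_read_src ? SKIP : READ_SRC;
	if (!d->write_error_seen && doing_repair)
		return WRITE_TGT;               /* rewrite the bad block from a good copy */
	return SKIP;                            /* leave it alone for this window */
}

int main(void)
{
	struct dev devs[3] = {
		{ .in_sync = true },                          /* clean copy */
		{ .in_sync = true, .bad_block_here = true },  /* readable but has a bad block */
		{ .in_sync = false },                         /* spare being rebuilt */
	};
	bool have_read_src = false;

	for (int i = 0; i < 3; i++) {
		enum role r = classify(&devs[i], have_read_src, true);
		if (r == READ_SRC)
			have_read_src = true;
		printf("dev %d -> %s\n", i,
		       r == READ_SRC ? "read source" :
		       r == WRITE_TGT ? "write target" : "skip");
	}
	return 0;
}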
@@ -2425,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
 		 */
-		sector_t rv = max_sector - sector_nr;
+		sector_t rv;
+		if (min_bad > 0)
+			max_sector = sector_nr + min_bad;
+		rv = max_sector - sector_nr;
 		*skipped = 1;
 		put_buf(r1_bio);
 		return rv;
@@ -2488,9 +2630,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 	 */
 	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		atomic_set(&r1_bio->remaining, read_targets);
-		for (i = 0; i < conf->raid_disks * 2; i++) {
+		for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io == end_sync_read) {
+				read_targets--;
 				md_sync_acct(bio->bi_bdev, nr_sectors);
 				generic_make_request(bio);
 			}
@@ -2517,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
 	struct r1conf *conf;
 	int i;
-	struct mirror_info *disk;
+	struct raid1_info *disk;
 	struct md_rdev *rdev;
 	int err = -ENOMEM;
 
@@ -2525,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				GFP_KERNEL);
 	if (!conf->mirrors)
@@ -2568,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
+		disk->seq_start = MaxSector;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
@@ -2581,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	err = -EIO;
-	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
@@ -2607,21 +2750,11 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			if (disk->rdev &&
 			    (disk->rdev->saved_raid_disk < 0))
 				conf->fullsync = 1;
-		} else if (conf->last_used < 0)
-			/*
-			 * The first working device is used as a
-			 * starting point to read balancing.
-			 */
-			conf->last_used = i;
+		}
 	}
 
-	if (conf->last_used < 0) {
-		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-		       mdname(mddev));
-		goto abort;
-	}
 	err = -ENOMEM;
-	conf->thread = md_register_thread(raid1d, mddev, NULL);
+	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
 		printk(KERN_ERR
 		       "md/raid1:%s: couldn't allocate thread\n",
@@ -2794,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev)
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
-	struct mirror_info *newmirrors;
+	struct raid1_info *newmirrors;
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
@@ -2837,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
 			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
@@ -2876,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	conf->last_used = 0; /* just make sure it is in-range */
 	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);