1 files changed, 60 insertions, 27 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 017f58113c33..32389d2f18fc 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -537,7 +537,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
        current_distance = abs(r10_bio->devs[slot].addr -
                               conf->mirrors[disk].head_position);
-        /* Find the disk whose head is closest */
+        /* Find the disk whose head is closest,
+         * or - for far > 1 - find the closest to partition beginning */
        for (nslot = slot; nslot < conf->copies; nslot++) {
                int ndisk = r10_bio->devs[nslot].devnum;
@@ -557,8 +558,13 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
                        slot = nslot;
                        break;
                }
-                new_distance = abs(r10_bio->devs[nslot].addr -
-                                   conf->mirrors[ndisk].head_position);
+                /* for far > 1 always use the lowest address */
+                if (conf->far_copies > 1)
+                        new_distance = r10_bio->devs[nslot].addr;
+                else
+                        new_distance = abs(r10_bio->devs[nslot].addr -
+                                           conf->mirrors[ndisk].head_position);
                if (new_distance < current_distance) {
                        current_distance = new_distance;
                        disk = ndisk;
@@ -629,7 +635,36 @@ static int raid10_congested(void *data, int bits)
        return ret;
 }
+static int flush_pending_writes(conf_t *conf)
+{
+        /* Any writes that have been queued but are awaiting
+         * bitmap updates get flushed here.
+         * Under conf->device_lock we check pending_bio_list, take
+         * the queued bios off it with bio_list_get() and call
+         * blk_remove_plug() on the array's queue.  The lock is
+         * dropped before bitmap_unplug() and before any bio is
+         * handed to generic_make_request().
+         * We return 1 if any requests were actually submitted,
+         * and 0 if pending_bio_list was empty.
+         * NOTE(review): the lock is taken with spin_lock_irq rather
+         * than the irqsave variant, so callers presumably must not
+         * rely on interrupts staying disabled across this call -
+         * confirm against the call sites.
+         */
+        int rv = 0;
+        spin_lock_irq(&conf->device_lock);
+        if (conf->pending_bio_list.head) {
+                struct bio *bio;
+                bio = bio_list_get(&conf->pending_bio_list);
+                blk_remove_plug(conf->mddev->queue);
+                spin_unlock_irq(&conf->device_lock);
+                /* flush any pending bitmap writes to disk
+                 * before proceeding w/ I/O */
+                bitmap_unplug(conf->mddev->bitmap);
+                while (bio) { /* submit pending writes; each bio is unlinked from the bi_next chain first */
+                        struct bio *next = bio->bi_next;
+                        bio->bi_next = NULL;
+                        generic_make_request(bio);
+                        bio = next;
+                }
+                rv = 1;
+        } else
+                spin_unlock_irq(&conf->device_lock);
+        return rv;
+}
 /* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
@@ -712,15 +747,23 @@ static void freeze_array(conf_t *conf)
        /* stop syncio and normal IO and wait for everything to
         * go quiet.
         * We increment barrier and nr_waiting, and then
-         * wait until barrier+nr_pending match nr_queued+2
+         * wait until nr_pending match nr_queued+1
+         * This is called in the context of one normal IO request
+         * that has failed. Thus any sync request that might be pending
+         * will be blocked by nr_pending, and we need to wait for
+         * pending IO requests to complete or be queued for re-try.
+         * Thus the number queued (nr_queued) plus this request (1)
+         * must match the number of pending IOs (nr_pending) before
+         * we continue.
         */
        spin_lock_irq(&conf->resync_lock);
        conf->barrier++;
        conf->nr_waiting++;
        wait_event_lock_irq(conf->wait_barrier,
-                            conf->barrier+conf->nr_pending == conf->nr_queued+2,
+                            conf->nr_pending == conf->nr_queued+1,
                            conf->resync_lock,
-                            raid10_unplug(conf->mddev->queue));
+                            ({ flush_pending_writes(conf);
+                               raid10_unplug(conf->mddev->queue); }));
        spin_unlock_irq(&conf->resync_lock);
 }
@@ -892,6 +935,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
        blk_plug_device(mddev->queue);
        spin_unlock_irqrestore(&conf->device_lock, flags);
+        /* In case raid10d snuck in to freeze_array */
+        wake_up(&conf->wait_barrier);
        if (do_sync)
                md_wakeup_thread(mddev->thread);
@@ -1464,28 +1510,14 @@ static void raid10d(mddev_t *mddev)
        for (;;) {
                char b[BDEVNAME_SIZE];
-                spin_lock_irqsave(&conf->device_lock, flags);
-                if (conf->pending_bio_list.head) {
+                unplug += flush_pending_writes(conf);
-                        bio = bio_list_get(&conf->pending_bio_list);
-                        blk_remove_plug(mddev->queue);
-                        spin_unlock_irqrestore(&conf->device_lock, flags);
-                        /* flush any pending bitmap writes to disk before proceeding w/ I/O */
-                        bitmap_unplug(mddev->bitmap);
-                        while (bio) { /* submit pending writes */
-                                struct bio *next = bio->bi_next;
-                                bio->bi_next = NULL;
-                                generic_make_request(bio);
-                                bio = next;
-                        }
-                        unplug = 1;
-                        continue;
-                }
-                if (list_empty(head))
+                spin_lock_irqsave(&conf->device_lock, flags);
+                if (list_empty(head)) {
+                        spin_unlock_irqrestore(&conf->device_lock, flags);
                        break;
+                }
                r10_bio = list_entry(head->prev, r10bio_t, retry_list);
                list_del(head->prev);
                conf->nr_queued--;
@@ -1548,7 +1580,6 @@ static void raid10d(mddev_t *mddev)
                        }
                }
        }
-        spin_unlock_irqrestore(&conf->device_lock, flags);
        if (unplug)
                unplug_slaves(mddev);
 }
@@ -1787,6 +1818,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                                if (j == conf->copies) {
                                        /* Cannot recover, so abort the recovery */
                                        put_buf(r10_bio);
+                                        if (rb2)
+                                                atomic_dec(&rb2->remaining);
                                        r10_bio = rb2;
                                        if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
                                                printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 017f58113c33..32389d2f18fc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c
@@ -537,7 +537,8 @@ static int read_balance(conf_t conf, r10bio_t r10_bio)
537	current_distance = abs(r10_bio->devs[slot].addr -	537	current_distance = abs(r10_bio->devs[slot].addr -
538	conf->mirrors[disk].head_position);	538	conf->mirrors[disk].head_position);
539		539
540	/* Find the disk whose head is closest */	540	/* Find the disk whose head is closest,
		541	* or - for far > 1 - find the closest to partition beginning */
541		542
542	for (nslot = slot; nslot < conf->copies; nslot++) {	543	for (nslot = slot; nslot < conf->copies; nslot++) {
543	int ndisk = r10_bio->devs[nslot].devnum;	544	int ndisk = r10_bio->devs[nslot].devnum;
@@ -557,8 +558,13 @@ static int read_balance(conf_t conf, r10bio_t r10_bio)
557	slot = nslot;	558	slot = nslot;
558	break;	559	break;
559	}	560	}
560	new_distance = abs(r10_bio->devs[nslot].addr -	561
561	conf->mirrors[ndisk].head_position);	562	/* for far > 1 always use the lowest address */
		563	if (conf->far_copies > 1)
		564	new_distance = r10_bio->devs[nslot].addr;
		565	else
		566	new_distance = abs(r10_bio->devs[nslot].addr -
		567	conf->mirrors[ndisk].head_position);
562	if (new_distance < current_distance) {	568	if (new_distance < current_distance) {
563	current_distance = new_distance;	569	current_distance = new_distance;
564	disk = ndisk;	570	disk = ndisk;
@@ -629,7 +635,36 @@ static int raid10_congested(void *data, int bits)
629	return ret;	635	return ret;
630	}	636	}
631		637
632		638	static int flush_pending_writes(conf_t *conf)
		639	{
		640	/* Any writes that have been queued but are awaiting
		641	* bitmap updates get flushed here.
		642	* We return 1 if any requests were actually submitted.
		643	*/
		644	int rv = 0;
		645
		646	spin_lock_irq(&conf->device_lock);
		647
		648	if (conf->pending_bio_list.head) {
		649	struct bio *bio;
		650	bio = bio_list_get(&conf->pending_bio_list);
		651	blk_remove_plug(conf->mddev->queue);
		652	spin_unlock_irq(&conf->device_lock);
		653	/* flush any pending bitmap writes to disk
		654	* before proceeding w/ I/O */
		655	bitmap_unplug(conf->mddev->bitmap);
		656
		657	while (bio) { /* submit pending writes */
		658	struct bio *next = bio->bi_next;
		659	bio->bi_next = NULL;
		660	generic_make_request(bio);
		661	bio = next;
		662	}
		663	rv = 1;
		664	} else
		665	spin_unlock_irq(&conf->device_lock);
		666	return rv;
		667	}
633	/* Barriers....	668	/* Barriers....
634	* Sometimes we need to suspend IO while we do something else,	669	* Sometimes we need to suspend IO while we do something else,
635	* either some resync/recovery, or reconfigure the array.	670	* either some resync/recovery, or reconfigure the array.
@@ -712,15 +747,23 @@ static void freeze_array(conf_t *conf)
712	/* stop syncio and normal IO and wait for everything to	747	/* stop syncio and normal IO and wait for everything to
713	* go quiet.	748	* go quiet.
714	* We increment barrier and nr_waiting, and then	749	* We increment barrier and nr_waiting, and then
715	* wait until barrier+nr_pending match nr_queued+2	750	* wait until nr_pending match nr_queued+1
		751	* This is called in the context of one normal IO request
		752	* that has failed. Thus any sync request that might be pending
		753	* will be blocked by nr_pending, and we need to wait for
		754	* pending IO requests to complete or be queued for re-try.
		755	* Thus the number queued (nr_queued) plus this request (1)
		756	* must match the number of pending IOs (nr_pending) before
		757	* we continue.
716	*/	758	*/
717	spin_lock_irq(&conf->resync_lock);	759	spin_lock_irq(&conf->resync_lock);
718	conf->barrier++;	760	conf->barrier++;
719	conf->nr_waiting++;	761	conf->nr_waiting++;
720	wait_event_lock_irq(conf->wait_barrier,	762	wait_event_lock_irq(conf->wait_barrier,
721	conf->barrier+conf->nr_pending == conf->nr_queued+2,	763	conf->nr_pending == conf->nr_queued+1,
722	conf->resync_lock,	764	conf->resync_lock,
723	raid10_unplug(conf->mddev->queue));	765	({ flush_pending_writes(conf);
		766	raid10_unplug(conf->mddev->queue); }));
724	spin_unlock_irq(&conf->resync_lock);	767	spin_unlock_irq(&conf->resync_lock);
725	}	768	}
726		769
@@ -892,6 +935,9 @@ static int make_request(struct request_queue q, struct bio bio)
892	blk_plug_device(mddev->queue);	935	blk_plug_device(mddev->queue);
893	spin_unlock_irqrestore(&conf->device_lock, flags);	936	spin_unlock_irqrestore(&conf->device_lock, flags);
894		937
		938	/* In case raid10d snuck in to freeze_array */
		939	wake_up(&conf->wait_barrier);
		940
895	if (do_sync)	941	if (do_sync)
896	md_wakeup_thread(mddev->thread);	942	md_wakeup_thread(mddev->thread);
897		943
@@ -1464,28 +1510,14 @@ static void raid10d(mddev_t *mddev)
1464		1510
1465	for (;;) {	1511	for (;;) {
1466	char b[BDEVNAME_SIZE];	1512	char b[BDEVNAME_SIZE];
1467	spin_lock_irqsave(&conf->device_lock, flags);
1468		1513
1469	if (conf->pending_bio_list.head) {	1514	unplug += flush_pending_writes(conf);
1470	bio = bio_list_get(&conf->pending_bio_list);
1471	blk_remove_plug(mddev->queue);
1472	spin_unlock_irqrestore(&conf->device_lock, flags);
1473	/* flush any pending bitmap writes to disk before proceeding w/ I/O */
1474	bitmap_unplug(mddev->bitmap);
1475
1476	while (bio) { /* submit pending writes */
1477	struct bio *next = bio->bi_next;
1478	bio->bi_next = NULL;
1479	generic_make_request(bio);
1480	bio = next;
1481	}
1482	unplug = 1;
1483
1484	continue;
1485	}
1486		1515
1487	if (list_empty(head))	1516	spin_lock_irqsave(&conf->device_lock, flags);
		1517	if (list_empty(head)) {
		1518	spin_unlock_irqrestore(&conf->device_lock, flags);
1488	break;	1519	break;
		1520	}
1489	r10_bio = list_entry(head->prev, r10bio_t, retry_list);	1521	r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1490	list_del(head->prev);	1522	list_del(head->prev);
1491	conf->nr_queued--;	1523	conf->nr_queued--;
@@ -1548,7 +1580,6 @@ static void raid10d(mddev_t *mddev)
1548	}	1580	}
1549	}	1581	}
1550	}	1582	}
1551	spin_unlock_irqrestore(&conf->device_lock, flags);
1552	if (unplug)	1583	if (unplug)
1553	unplug_slaves(mddev);	1584	unplug_slaves(mddev);
1554	}	1585	}
@@ -1787,6 +1818,8 @@ static sector_t sync_request(mddev_t mddev, sector_t sector_nr, int skipped, i
1787	if (j == conf->copies) {	1818	if (j == conf->copies) {
1788	/* Cannot recover, so abort the recovery */	1819	/* Cannot recover, so abort the recovery */
1789	put_buf(r10_bio);	1820	put_buf(r10_bio);
		1821	if (rb2)
		1822	atomic_dec(&rb2->remaining);
1790	r10_bio = rb2;	1823	r10_bio = rb2;
1791	if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))	1824	if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
1792	printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",	1825	printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",