author	NeilBrown <neilb@suse.de>	2008-03-04 17:29:29 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-03-04 19:35:17 -0500
commit	a35e63efa1fb18c6f20f38e3ddf3f8ffbcf0f6e7 (patch)
tree	8dddd54c45ebaad84a6178765d29d9536df944d1	/drivers/md/raid1.c
parent	466634488e80968f12e73dd1fe6af5c37a1fbfe2 (diff)
md: fix deadlock in md/raid1 and md/raid10 when handling a read error
When handling a read error, we freeze the array to stop any other IO while
attempting to over-write with correct data.

This is done in the raid1d (raid10d) thread and must wait for all submitted IO
to complete (except for requests that failed and are sitting in the retry
queue - these are counted in ->nr_queued and will stay there during a freeze).

However, write requests need attention from raid1d as bitmap updates might be
required.  This can cause a deadlock, as raid1d is waiting for requests to
finish that themselves need attention from raid1d.

So we create a new function 'flush_pending_writes' to give that attention,
and call it in freeze_array to be sure that we aren't waiting on raid1d.

Thanks to "K.Tanaka" <k-tanaka@ce.jp.nec.com> for finding and reporting this
problem.

Cc: "K.Tanaka" <k-tanaka@ce.jp.nec.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
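The shape of the fix is easier to see in isolation: the thread waiting for nr_pending to drain is the only thread that can complete the writes parked behind bitmap updates, so the wait loop itself has to keep flushing that queue. The following userspace sketch is illustrative only; the names nr_pending, pending_writes, flush_pending and freeze are hypothetical stand-ins for the roles played by conf->nr_pending, conf->pending_bio_list, flush_pending_writes() and freeze_array() in the patch below, not the kernel API.

/*
 * Illustrative userspace sketch (not kernel code).  The waiter is also
 * the only party able to complete the parked writes, so the wait loop
 * must flush that queue on every pass or it never terminates.
 */
#include <stdio.h>

static int nr_pending = 3;     /* IO still in flight */
static int pending_writes = 3; /* writes parked, waiting for this thread */

/* role of flush_pending_writes(): submit the parked writes */
static int flush_pending(void)
{
	int rv = 0;

	while (pending_writes > 0) {
		pending_writes--;
		nr_pending--;  /* a parked write finally completes */
		rv = 1;
	}
	return rv;
}

/* role of freeze_array(): wait for all IO, flushing while we wait */
static void freeze(void)
{
	while (nr_pending != 0)
		flush_pending(); /* dropping this call reintroduces the hang */
}

int main(void)
{
	freeze();
	printf("frozen: nr_pending=%d, pending_writes=%d\n",
	       nr_pending, pending_writes);
	return 0;
}

In the real code the flushing is folded into the condition re-check of wait_event_lock_irq(), as the freeze_array hunk in the diff below shows.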
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--	drivers/md/raid1.c	62
1 files changed, 41 insertions, 21 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5c7fef091cec..38f076a3400d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -592,6 +592,37 @@ static int raid1_congested(void *data, int bits)
 }
 
 
+static int flush_pending_writes(conf_t *conf)
+{
+	/* Any writes that have been queued but are awaiting
+	 * bitmap updates get flushed here.
+	 * We return 1 if any requests were actually submitted.
+	 */
+	int rv = 0;
+
+	spin_lock_irq(&conf->device_lock);
+
+	if (conf->pending_bio_list.head) {
+		struct bio *bio;
+		bio = bio_list_get(&conf->pending_bio_list);
+		blk_remove_plug(conf->mddev->queue);
+		spin_unlock_irq(&conf->device_lock);
+		/* flush any pending bitmap writes to
+		 * disk before proceeding w/ I/O */
+		bitmap_unplug(conf->mddev->bitmap);
+
+		while (bio) { /* submit pending writes */
+			struct bio *next = bio->bi_next;
+			bio->bi_next = NULL;
+			generic_make_request(bio);
+			bio = next;
+		}
+		rv = 1;
+	} else
+		spin_unlock_irq(&conf->device_lock);
+	return rv;
+}
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -681,7 +712,8 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
 			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    ({ flush_pending_writes(conf);
+			       raid1_unplug(conf->mddev->queue); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(conf_t *conf)
@@ -907,6 +939,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	blk_plug_device(mddev->queue);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
+	/* In case raid1d snuck into freeze_array */
+	wake_up(&conf->wait_barrier);
+
 	if (do_sync)
 		md_wakeup_thread(mddev->thread);
 #if 0
@@ -1473,28 +1508,14 @@ static void raid1d(mddev_t *mddev)
 
 	for (;;) {
 		char b[BDEVNAME_SIZE];
-		spin_lock_irqsave(&conf->device_lock, flags);
-
-		if (conf->pending_bio_list.head) {
-			bio = bio_list_get(&conf->pending_bio_list);
-			blk_remove_plug(mddev->queue);
-			spin_unlock_irqrestore(&conf->device_lock, flags);
-			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
-			bitmap_unplug(mddev->bitmap);
 
-			while (bio) { /* submit pending writes */
-				struct bio *next = bio->bi_next;
-				bio->bi_next = NULL;
-				generic_make_request(bio);
-				bio = next;
-			}
-			unplug = 1;
+		unplug += flush_pending_writes(conf);
 
-			continue;
-		}
-
-		if (list_empty(head))
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (list_empty(head)) {
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			break;
+		}
 		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
 		list_del(head->prev);
 		conf->nr_queued--;
@@ -1590,7 +1611,6 @@ static void raid1d(mddev_t *mddev)
 			}
 		}
 	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
 	if (unplug)
 		unplug_slaves(mddev);
 }