author    NeilBrown <neilb@suse.de>  2008-03-04 17:29:29 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-03-04 19:35:17 -0500
commit    a35e63efa1fb18c6f20f38e3ddf3f8ffbcf0f6e7 (patch)
tree      8dddd54c45ebaad84a6178765d29d9536df944d1 /drivers/md
parent    466634488e80968f12e73dd1fe6af5c37a1fbfe2 (diff)
md: fix deadlock in md/raid1 and md/raid10 when handling a read error

When handling a read error, we freeze the array to stop any other IO while
attempting to over-write with correct data.

This is done in the raid1d (raid10d) thread and must wait for all submitted IO
to complete (except for requests that failed and are sitting in the retry
queue - these are counted in ->nr_queued and will stay there during a freeze).

However write requests need attention from raid1d as bitmap updates might be
required.  This can cause a deadlock as raid1 is waiting for requests to
finish that themselves need attention from raid1d.

So we create a new function 'flush_pending_writes' to give that attention,
and call it in freeze_array to be sure that we aren't waiting on raid1d.

Thanks to "K.Tanaka" <k-tanaka@ce.jp.nec.com> for finding and reporting this
problem.

Cc: "K.Tanaka" <k-tanaka@ce.jp.nec.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
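The deadlock described above can be modelled outside the kernel. The sketch below is a hypothetical userspace illustration using POSIX threads, not md code: the names freeze_array, flush_pending_writes and nr_pending mirror the patch, while the pending_writes counter is an invented stand-in for conf->pending_bio_list. It demonstrates the rule the patch enforces: the thread doing the freeze must itself flush the queued writes it is waiting on, because no other thread will.

/* Userspace model of the fix (illustration only, not kernel code). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_barrier = PTHREAD_COND_INITIALIZER;
static int nr_pending;     /* writes submitted but not yet completed */
static int pending_writes; /* writes parked, awaiting the daemon thread */

/* Submit and complete everything parked on the queue; caller holds lock.
 * Returns 1 if anything was actually flushed, as in the patch. */
static int flush_pending_writes(void)
{
	int flushed = pending_writes;

	nr_pending -= flushed;
	pending_writes = 0;
	if (flushed)
		pthread_cond_broadcast(&wait_barrier);
	return flushed != 0;
}

static void freeze_array(void)
{
	pthread_mutex_lock(&lock);
	while (nr_pending != 0) {
		/* The fix: give the parked writes the attention they need
		 * instead of waiting for this same thread to do it later. */
		if (flush_pending_writes())
			continue;
		pthread_cond_wait(&wait_barrier, &lock);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	/* Two writes are in flight and both are parked on the queue. */
	pthread_mutex_lock(&lock);
	nr_pending = pending_writes = 2;
	pthread_mutex_unlock(&lock);

	/* Without flush_pending_writes() in the wait loop this would block
	 * forever: the only thread that could submit the writes is the one
	 * that is waiting for them to finish. */
	freeze_array();
	printf("array frozen without deadlocking\n");
	return 0;
}

In the patch itself the same idea appears as the extra command passed to wait_event_lock_irq() in freeze_array(): flush_pending_writes(conf) is run each time the barrier condition is re-checked, so the freeze can never stall on writes that only raid1d/raid10d would submit.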
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/raid1.c  | 62
-rw-r--r--  drivers/md/raid10.c | 62
2 files changed, 81 insertions, 43 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5c7fef091cec..38f076a3400d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -592,6 +592,37 @@ static int raid1_congested(void *data, int bits)
 }
 
 
+static int flush_pending_writes(conf_t *conf)
+{
+	/* Any writes that have been queued but are awaiting
+	 * bitmap updates get flushed here.
+	 * We return 1 if any requests were actually submitted.
+	 */
+	int rv = 0;
+
+	spin_lock_irq(&conf->device_lock);
+
+	if (conf->pending_bio_list.head) {
+		struct bio *bio;
+		bio = bio_list_get(&conf->pending_bio_list);
+		blk_remove_plug(conf->mddev->queue);
+		spin_unlock_irq(&conf->device_lock);
+		/* flush any pending bitmap writes to
+		 * disk before proceeding w/ I/O */
+		bitmap_unplug(conf->mddev->bitmap);
+
+		while (bio) { /* submit pending writes */
+			struct bio *next = bio->bi_next;
+			bio->bi_next = NULL;
+			generic_make_request(bio);
+			bio = next;
+		}
+		rv = 1;
+	} else
+		spin_unlock_irq(&conf->device_lock);
+	return rv;
+}
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -681,7 +712,8 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
 			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    ({ flush_pending_writes(conf);
+			       raid1_unplug(conf->mddev->queue); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(conf_t *conf)
@@ -907,6 +939,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	blk_plug_device(mddev->queue);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
+	/* In case raid1d snuck into freeze_array */
+	wake_up(&conf->wait_barrier);
+
 	if (do_sync)
 		md_wakeup_thread(mddev->thread);
 #if 0
@@ -1473,28 +1508,14 @@ static void raid1d(mddev_t *mddev)
 
 	for (;;) {
 		char b[BDEVNAME_SIZE];
-		spin_lock_irqsave(&conf->device_lock, flags);
-
-		if (conf->pending_bio_list.head) {
-			bio = bio_list_get(&conf->pending_bio_list);
-			blk_remove_plug(mddev->queue);
-			spin_unlock_irqrestore(&conf->device_lock, flags);
-			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
-			bitmap_unplug(mddev->bitmap);
 
-			while (bio) { /* submit pending writes */
-				struct bio *next = bio->bi_next;
-				bio->bi_next = NULL;
-				generic_make_request(bio);
-				bio = next;
-			}
-			unplug = 1;
+		unplug += flush_pending_writes(conf);
 
-			continue;
-		}
-
-		if (list_empty(head))
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (list_empty(head)) {
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			break;
+		}
 		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
 		list_del(head->prev);
 		conf->nr_queued--;
@@ -1590,7 +1611,6 @@ static void raid1d(mddev_t *mddev)
 			}
 		}
 	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
 	if (unplug)
 		unplug_slaves(mddev);
 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 017f58113c33..5de42d87bf4e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -629,7 +629,36 @@ static int raid10_congested(void *data, int bits)
 	return ret;
 }
 
-
+static int flush_pending_writes(conf_t *conf)
+{
+	/* Any writes that have been queued but are awaiting
+	 * bitmap updates get flushed here.
+	 * We return 1 if any requests were actually submitted.
+	 */
+	int rv = 0;
+
+	spin_lock_irq(&conf->device_lock);
+
+	if (conf->pending_bio_list.head) {
+		struct bio *bio;
+		bio = bio_list_get(&conf->pending_bio_list);
+		blk_remove_plug(conf->mddev->queue);
+		spin_unlock_irq(&conf->device_lock);
+		/* flush any pending bitmap writes to disk
+		 * before proceeding w/ I/O */
+		bitmap_unplug(conf->mddev->bitmap);
+
+		while (bio) { /* submit pending writes */
+			struct bio *next = bio->bi_next;
+			bio->bi_next = NULL;
+			generic_make_request(bio);
+			bio = next;
+		}
+		rv = 1;
+	} else
+		spin_unlock_irq(&conf->device_lock);
+	return rv;
+}
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -720,7 +749,8 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
 			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    ({ flush_pending_writes(conf);
+			       raid10_unplug(conf->mddev->queue); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -892,6 +922,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	blk_plug_device(mddev->queue);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
+	/* In case raid10d snuck in to freeze_array */
+	wake_up(&conf->wait_barrier);
+
 	if (do_sync)
 		md_wakeup_thread(mddev->thread);
 
@@ -1464,28 +1497,14 @@ static void raid10d(mddev_t *mddev)
 
 	for (;;) {
 		char b[BDEVNAME_SIZE];
-		spin_lock_irqsave(&conf->device_lock, flags);
 
-		if (conf->pending_bio_list.head) {
-			bio = bio_list_get(&conf->pending_bio_list);
-			blk_remove_plug(mddev->queue);
-			spin_unlock_irqrestore(&conf->device_lock, flags);
-			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
-			bitmap_unplug(mddev->bitmap);
+		unplug += flush_pending_writes(conf);
 
-			while (bio) { /* submit pending writes */
-				struct bio *next = bio->bi_next;
-				bio->bi_next = NULL;
-				generic_make_request(bio);
-				bio = next;
-			}
-			unplug = 1;
-
-			continue;
-		}
-
-		if (list_empty(head))
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (list_empty(head)) {
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			break;
+		}
 		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
 		list_del(head->prev);
 		conf->nr_queued--;
@@ -1548,7 +1567,6 @@ static void raid10d(mddev_t *mddev)
 			}
 		}
 	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
 	if (unplug)
 		unplug_slaves(mddev);
 }