dm raid1: fix EIO after log failure

This patch adds the ability to requeue write I/O to core device-mapper when there is a log device failure. If a write to the log produces an error, the pending writes are put on the "failures" list. Since the log is marked as failed, they will stay on the failures list until a suspend happens. Suspends come in two phases, presuspend and postsuspend. We must make sure that all the writes on the failures list are requeued in the presuspend phase (a requirement of dm core). This means that recovery must be complete (because writes may be delayed behind it) and the failures list must be requeued before we return from presuspend. The mechanisms to ensure recovery is complete (or stopped) were already in place, but needed to be moved from postsuspend to presuspend. We rely on 'flush_workqueue' to ensure that the mirror thread is complete and, therefore, has requeued all writes in the failures list. Because we are using flush_workqueue, we must ensure that no additional 'queue_work' calls will produce additional I/O that we need to requeue (because once we return from presuspend, we are unable to do anything about it). 'queue_work' is called in response to the following functions: - complete_resync_work = NA, recovery is stopped - rh_dec (mirror_end_io) = NA, only calls 'queue_work' if it is ready to recover the region (recovery is stopped) or it needs to clear the region in the log* **this doesn't get called while suspending** - rh_recovery_end = NA, recovery is stopped - rh_recovery_start = NA, recovery is stopped - write_callback = 1) Writes w/o failures simply call bio_endio -> mirror_end_io -> rh_dec (see rh_dec above) 2) Writes with failures are put on the failures list and queue_work is called** ** write_callbacks don't happen during suspend ** - do_failures = NA, 'queue_work' not called if suspending - add_mirror (initialization) = NA, only done on mirror creation - queue_bio = NA, 1) delayed I/O scheduled before flush_workqueue is called. 
2) No more I/Os are being issued. 3) Re-attempted READs can still be handled. (Write completions are handled through rh_dec/write_callback - mentioned above - and do not use queue_bio.) Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
author: Jonathan Brassow <jbrassow@redhat.com> 2008-02-07 21:11:35 -0500
committer: Alasdair G Kergon <agk@redhat.com> 2008-02-07 21:11:35 -0500
commit: b80aa7a0c268d3ae0c472f648af1e3e4a359765c (patch)
tree: ce3d7f686a2b022dab3e0f8ddc1846d2ac4f6c58 /drivers
parent: 8f0205b798f926e2745de5fdebf0a8605c621de6 (diff)
1 files changed, 90 insertions, 11 deletions
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9978b9f07fe9..ec6d675bf766 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -146,6 +146,7 @@ struct mirror_set {
        region_t nr_regions;
        int in_sync;
        int log_failure;
+        atomic_t suspend;
        atomic_t default_mirror;        /* Default mirror */
@@ -372,6 +373,16 @@ static void complete_resync_work(struct region *reg, int success)
        struct region_hash *rh = reg->rh;
        rh->log->type->set_region_sync(rh->log, reg->key, success);
+        /*
+         * Dispatch the bios before we call 'wake_up_all'.
+         * This is important because if we are suspending,
+         * we want to know that recovery is complete and
+         * the work queue is flushed.  If we wake_up_all
+         * before we dispatch_bios (queue bios and call wake()),
+         * then we risk suspending before the work queue
+         * has been properly flushed.
+         */
        dispatch_bios(rh->ms, &reg->delayed_bios);
        if (atomic_dec_and_test(&rh->recovery_in_flight))
                wake_up_all(&_kmirrord_recovery_stopped);
@@ -1069,11 +1080,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
        /*
         * Dispatch io.
         */
-        if (unlikely(ms->log_failure))
+        if (unlikely(ms->log_failure)) {
+                spin_lock_irq(&ms->lock);
+                bio_list_merge(&ms->failures, &sync);
+                spin_unlock_irq(&ms->lock);
+        } else
                while ((bio = bio_list_pop(&sync)))
-                        bio_endio(bio, -EIO);
+                        do_write(ms, bio);
-        else while ((bio = bio_list_pop(&sync)))
-                do_write(ms, bio);
        while ((bio = bio_list_pop(&recover)))
                rh_delay(&ms->rh, bio);
@@ -1091,8 +1104,46 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
        if (!failures->head)
                return;
-        while ((bio = bio_list_pop(failures)))
+        if (!ms->log_failure) {
-                __bio_mark_nosync(ms, bio, bio->bi_size, 0);
+                while ((bio = bio_list_pop(failures)))
+                        __bio_mark_nosync(ms, bio, bio->bi_size, 0);
+                return;
+        }
+        /*
+         * If the log has failed, unattempted writes are being
+         * put on the failures list.  We can't issue those writes
+         * until a log has been marked, so we must store them.
+         *
+         * If a 'noflush' suspend is in progress, we can requeue
+         * the I/O's to the core.  This give userspace a chance
+         * to reconfigure the mirror, at which point the core
+         * will reissue the writes.  If the 'noflush' flag is
+         * not set, we have no choice but to return errors.
+         *
+         * Some writes on the failures list may have been
+         * submitted before the log failure and represent a
+         * failure to write to one of the devices.  It is ok
+         * for us to treat them the same and requeue them
+         * as well.
+         */
+        if (dm_noflush_suspending(ms->ti)) {
+                while ((bio = bio_list_pop(failures)))
+                        bio_endio(bio, DM_ENDIO_REQUEUE);
+                return;
+        }
+        if (atomic_read(&ms->suspend)) {
+                while ((bio = bio_list_pop(failures)))
+                        bio_endio(bio, -EIO);
+                return;
+        }
+        spin_lock_irq(&ms->lock);
+        bio_list_merge(&ms->failures, failures);
+        spin_unlock_irq(&ms->lock);
+        wake(ms);
 }
 static void trigger_event(struct work_struct *work)
@@ -1176,6 +1227,8 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
        ms->nr_mirrors = nr_mirrors;
        ms->nr_regions = dm_sector_div_up(ti->len, region_size);
        ms->in_sync = 0;
+        ms->log_failure = 0;
+        atomic_set(&ms->suspend, 0);
        atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
        ms->io_client = dm_io_client_create(DM_IO_PAGES);
@@ -1511,26 +1564,51 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
        return 0;
 }
-static void mirror_postsuspend(struct dm_target *ti)
+static void mirror_presuspend(struct dm_target *ti)
 {
        struct mirror_set *ms = (struct mirror_set *) ti->private;
        struct dirty_log *log = ms->rh.log;
+        atomic_set(&ms->suspend, 1);
+        /*
+         * We must finish up all the work that we've
+         * generated (i.e. recovery work).
+         */
        rh_stop_recovery(&ms->rh);
-        /* Wait for all I/O we generated to complete */
        wait_event(_kmirrord_recovery_stopped,
                   !atomic_read(&ms->rh.recovery_in_flight));
+        if (log->type->presuspend && log->type->presuspend(log))
+                /* FIXME: need better error handling */
+                DMWARN("log presuspend failed");
+        /*
+         * Now that recovery is complete/stopped and the
+         * delayed bios are queued, we need to wait for
+         * the worker thread to complete.  This way,
+         * we know that all of our I/O has been pushed.
+         */
+        flush_workqueue(ms->kmirrord_wq);
+}
+static void mirror_postsuspend(struct dm_target *ti)
+{
+        struct mirror_set *ms = ti->private;
+        struct dirty_log *log = ms->rh.log;
        if (log->type->postsuspend && log->type->postsuspend(log))
                /* FIXME: need better error handling */
-                DMWARN("log suspend failed");
+                DMWARN("log postsuspend failed");
 }
 static void mirror_resume(struct dm_target *ti)
 {
-        struct mirror_set *ms = (struct mirror_set *) ti->private;
+        struct mirror_set *ms = ti->private;
        struct dirty_log *log = ms->rh.log;
+        atomic_set(&ms->suspend, 0);
        if (log->type->resume && log->type->resume(log))
                /* FIXME: need better error handling */
                DMWARN("log resume failed");
@@ -1564,7 +1642,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
                DMEMIT("%d", ms->nr_mirrors);
                for (m = 0; m < ms->nr_mirrors; m++)
                        DMEMIT(" %s %llu", ms->mirror[m].dev->name,
-                                (unsigned long long)ms->mirror[m].offset);
+                               (unsigned long long)ms->mirror[m].offset);
                if (ms->features & DM_RAID1_HANDLE_ERRORS)
                        DMEMIT(" 1 handle_errors");
@@ -1581,6 +1659,7 @@ static struct target_type mirror_target = {
        .dtr     = mirror_dtr,
        .map     = mirror_map,
        .end_io  = mirror_end_io,
+        .presuspend = mirror_presuspend,
        .postsuspend = mirror_postsuspend,
        .resume  = mirror_resume,
        .status  = mirror_status,
author	Jonathan Brassow <jbrassow@redhat.com>	2008-02-07 21:11:35 -0500
committer	Alasdair G Kergon <agk@redhat.com>	2008-02-07 21:11:35 -0500
commit	b80aa7a0c268d3ae0c472f648af1e3e4a359765c (patch)
tree	ce3d7f686a2b022dab3e0f8ddc1846d2ac4f6c58 /drivers
parent	8f0205b798f926e2745de5fdebf0a8605c621de6 (diff)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9978b9f07fe9..ec6d675bf766 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c
@@ -146,6 +146,7 @@ struct mirror_set {
146	region_t nr_regions;	146	region_t nr_regions;
147	int in_sync;	147	int in_sync;
148	int log_failure;	148	int log_failure;
		149	atomic_t suspend;
149		150
150	atomic_t default_mirror; /* Default mirror */	151	atomic_t default_mirror; /* Default mirror */
151		152
@@ -372,6 +373,16 @@ static void complete_resync_work(struct region *reg, int success)
372	struct region_hash *rh = reg->rh;	373	struct region_hash *rh = reg->rh;
373		374
374	rh->log->type->set_region_sync(rh->log, reg->key, success);	375	rh->log->type->set_region_sync(rh->log, reg->key, success);
		376
		377	/*
		378	* Dispatch the bios before we call 'wake_up_all'.
		379	* This is important because if we are suspending,
		380	* we want to know that recovery is complete and
		381	* the work queue is flushed. If we wake_up_all
		382	* before we dispatch_bios (queue bios and call wake()),
		383	* then we risk suspending before the work queue
		384	* has been properly flushed.
		385	*/
375	dispatch_bios(rh->ms, &reg->delayed_bios);	386	dispatch_bios(rh->ms, &reg->delayed_bios);
376	if (atomic_dec_and_test(&rh->recovery_in_flight))	387	if (atomic_dec_and_test(&rh->recovery_in_flight))
377	wake_up_all(&_kmirrord_recovery_stopped);	388	wake_up_all(&_kmirrord_recovery_stopped);
@@ -1069,11 +1080,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
1069	/*	1080	/*
1070	* Dispatch io.	1081	* Dispatch io.
1071	*/	1082	*/
1072	if (unlikely(ms->log_failure))	1083	if (unlikely(ms->log_failure)) {
		1084	spin_lock_irq(&ms->lock);
		1085	bio_list_merge(&ms->failures, &sync);
		1086	spin_unlock_irq(&ms->lock);
		1087	} else
1073	while ((bio = bio_list_pop(&sync)))	1088	while ((bio = bio_list_pop(&sync)))
1074	bio_endio(bio, -EIO);	1089	do_write(ms, bio);
1075	else while ((bio = bio_list_pop(&sync)))
1076	do_write(ms, bio);
1077		1090
1078	while ((bio = bio_list_pop(&recover)))	1091	while ((bio = bio_list_pop(&recover)))
1079	rh_delay(&ms->rh, bio);	1092	rh_delay(&ms->rh, bio);
@@ -1091,8 +1104,46 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
1091	if (!failures->head)	1104	if (!failures->head)
1092	return;	1105	return;
1093		1106
1094	while ((bio = bio_list_pop(failures)))	1107	if (!ms->log_failure) {
1095	__bio_mark_nosync(ms, bio, bio->bi_size, 0);	1108	while ((bio = bio_list_pop(failures)))
		1109	__bio_mark_nosync(ms, bio, bio->bi_size, 0);
		1110	return;
		1111	}
		1112
		1113	/*
		1114	* If the log has failed, unattempted writes are being
		1115	* put on the failures list. We can't issue those writes
		1116	* until a log has been marked, so we must store them.
		1117	*
		1118	* If a 'noflush' suspend is in progress, we can requeue
		1119	* the I/O's to the core. This give userspace a chance
		1120	* to reconfigure the mirror, at which point the core
		1121	* will reissue the writes. If the 'noflush' flag is
		1122	* not set, we have no choice but to return errors.
		1123	*
		1124	* Some writes on the failures list may have been
		1125	* submitted before the log failure and represent a
		1126	* failure to write to one of the devices. It is ok
		1127	* for us to treat them the same and requeue them
		1128	* as well.
		1129	*/
		1130	if (dm_noflush_suspending(ms->ti)) {
		1131	while ((bio = bio_list_pop(failures)))
		1132	bio_endio(bio, DM_ENDIO_REQUEUE);
		1133	return;
		1134	}
		1135
		1136	if (atomic_read(&ms->suspend)) {
		1137	while ((bio = bio_list_pop(failures)))
		1138	bio_endio(bio, -EIO);
		1139	return;
		1140	}
		1141
		1142	spin_lock_irq(&ms->lock);
		1143	bio_list_merge(&ms->failures, failures);
		1144	spin_unlock_irq(&ms->lock);
		1145
		1146	wake(ms);
1096	}	1147	}
1097		1148
1098	static void trigger_event(struct work_struct *work)	1149	static void trigger_event(struct work_struct *work)
@@ -1176,6 +1227,8 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
1176	ms->nr_mirrors = nr_mirrors;	1227	ms->nr_mirrors = nr_mirrors;
1177	ms->nr_regions = dm_sector_div_up(ti->len, region_size);	1228	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
1178	ms->in_sync = 0;	1229	ms->in_sync = 0;
		1230	ms->log_failure = 0;
		1231	atomic_set(&ms->suspend, 0);
1179	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);	1232	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
1180		1233
1181	ms->io_client = dm_io_client_create(DM_IO_PAGES);	1234	ms->io_client = dm_io_client_create(DM_IO_PAGES);
@@ -1511,26 +1564,51 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1511	return 0;	1564	return 0;
1512	}	1565	}
1513		1566
1514	static void mirror_postsuspend(struct dm_target *ti)	1567	static void mirror_presuspend(struct dm_target *ti)
1515	{	1568	{
1516	struct mirror_set *ms = (struct mirror_set *) ti->private;	1569	struct mirror_set *ms = (struct mirror_set *) ti->private;
1517	struct dirty_log *log = ms->rh.log;	1570	struct dirty_log *log = ms->rh.log;
1518		1571
		1572	atomic_set(&ms->suspend, 1);
		1573
		1574	/*
		1575	* We must finish up all the work that we've
		1576	* generated (i.e. recovery work).
		1577	*/
1519	rh_stop_recovery(&ms->rh);	1578	rh_stop_recovery(&ms->rh);
1520		1579
1521	/* Wait for all I/O we generated to complete */
1522	wait_event(_kmirrord_recovery_stopped,	1580	wait_event(_kmirrord_recovery_stopped,
1523	!atomic_read(&ms->rh.recovery_in_flight));	1581	!atomic_read(&ms->rh.recovery_in_flight));
1524		1582
		1583	if (log->type->presuspend && log->type->presuspend(log))
		1584	/* FIXME: need better error handling */
		1585	DMWARN("log presuspend failed");
		1586
		1587	/*
		1588	* Now that recovery is complete/stopped and the
		1589	* delayed bios are queued, we need to wait for
		1590	* the worker thread to complete. This way,
		1591	* we know that all of our I/O has been pushed.
		1592	*/
		1593	flush_workqueue(ms->kmirrord_wq);
		1594	}
		1595
		1596	static void mirror_postsuspend(struct dm_target *ti)
		1597	{
		1598	struct mirror_set *ms = ti->private;
		1599	struct dirty_log *log = ms->rh.log;
		1600
1525	if (log->type->postsuspend && log->type->postsuspend(log))	1601	if (log->type->postsuspend && log->type->postsuspend(log))
1526	/* FIXME: need better error handling */	1602	/* FIXME: need better error handling */
1527	DMWARN("log suspend failed");	1603	DMWARN("log postsuspend failed");
1528	}	1604	}
1529		1605
1530	static void mirror_resume(struct dm_target *ti)	1606	static void mirror_resume(struct dm_target *ti)
1531	{	1607	{
1532	struct mirror_set *ms = (struct mirror_set *) ti->private;	1608	struct mirror_set *ms = ti->private;
1533	struct dirty_log *log = ms->rh.log;	1609	struct dirty_log *log = ms->rh.log;
		1610
		1611	atomic_set(&ms->suspend, 0);
1534	if (log->type->resume && log->type->resume(log))	1612	if (log->type->resume && log->type->resume(log))
1535	/* FIXME: need better error handling */	1613	/* FIXME: need better error handling */
1536	DMWARN("log resume failed");	1614	DMWARN("log resume failed");
@@ -1564,7 +1642,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1564	DMEMIT("%d", ms->nr_mirrors);	1642	DMEMIT("%d", ms->nr_mirrors);
1565	for (m = 0; m < ms->nr_mirrors; m++)	1643	for (m = 0; m < ms->nr_mirrors; m++)
1566	DMEMIT(" %s %llu", ms->mirror[m].dev->name,	1644	DMEMIT(" %s %llu", ms->mirror[m].dev->name,
1567	(unsigned long long)ms->mirror[m].offset);	1645	(unsigned long long)ms->mirror[m].offset);
1568		1646
1569	if (ms->features & DM_RAID1_HANDLE_ERRORS)	1647	if (ms->features & DM_RAID1_HANDLE_ERRORS)
1570	DMEMIT(" 1 handle_errors");	1648	DMEMIT(" 1 handle_errors");
@@ -1581,6 +1659,7 @@ static struct target_type mirror_target = {
1581	.dtr = mirror_dtr,	1659	.dtr = mirror_dtr,
1582	.map = mirror_map,	1660	.map = mirror_map,
1583	.end_io = mirror_end_io,	1661	.end_io = mirror_end_io,
		1662	.presuspend = mirror_presuspend,
1584	.postsuspend = mirror_postsuspend,	1663	.postsuspend = mirror_postsuspend,
1585	.resume = mirror_resume,	1664	.resume = mirror_resume,
1586	.status = mirror_status,	1665	.status = mirror_status,