aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jonathan Brassow <jbrassow@redhat.com> 2008-02-07 21:11:35 -0500
committer Alasdair G Kergon <agk@redhat.com> 2008-02-07 21:11:35 -0500
commit b80aa7a0c268d3ae0c472f648af1e3e4a359765c (patch)
tree ce3d7f686a2b022dab3e0f8ddc1846d2ac4f6c58
parent 8f0205b798f926e2745de5fdebf0a8605c621de6 (diff)
dm raid1: fix EIO after log failure
This patch adds the ability to requeue write I/O to core device-mapper when there is a log device failure. If a write to the log produces an error, the pending writes are put on the "failures" list. Since the log is marked as failed, they will stay on the failures list until a suspend happens. Suspends come in two phases, presuspend and postsuspend. We must make sure that all the writes on the failures list are requeued in the presuspend phase (a requirement of dm core). This means that recovery must be complete (because writes may be delayed behind it) and the failures list must be requeued before we return from presuspend. The mechanisms to ensure recovery is complete (or stopped) were already in place, but needed to be moved from postsuspend to presuspend. We rely on 'flush_workqueue' to ensure that the mirror thread is complete and, therefore, has requeued all writes in the failures list. Because we are using flush_workqueue, we must ensure that no additional 'queue_work' calls will produce additional I/O that we need to requeue (because once we return from presuspend, we are unable to do anything about it). 'queue_work' is called in response to the following functions: - complete_resync_work = NA, recovery is stopped - rh_dec (mirror_end_io) = NA, only calls 'queue_work' if it is ready to recover the region (recovery is stopped) or it needs to clear the region in the log* **this doesn't get called while suspending** - rh_recovery_end = NA, recovery is stopped - rh_recovery_start = NA, recovery is stopped - write_callback = 1) Writes w/o failures simply call bio_endio -> mirror_end_io -> rh_dec (see rh_dec above) 2) Writes with failures are put on the failures list and queue_work is called** ** write_callbacks don't happen during suspend ** - do_failures = NA, 'queue_work' not called if suspending - add_mirror (initialization) = NA, only done on mirror creation - queue_bio = NA, 1) delayed I/O scheduled before flush_workqueue is called. 
2) No more I/Os are being issued. 3) Re-attempted READs can still be handled. (Write completions are handled through rh_dec/write_callback - mentioned above - and do not use queue_bio.) Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
-rw-r--r-- drivers/md/dm-raid1.c 101
1 files changed, 90 insertions, 11 deletions
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9978b9f07fe9..ec6d675bf766 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -146,6 +146,7 @@ struct mirror_set {
146 region_t nr_regions; 146 region_t nr_regions;
147 int in_sync; 147 int in_sync;
148 int log_failure; 148 int log_failure;
149 atomic_t suspend;
149 150
150 atomic_t default_mirror; /* Default mirror */ 151 atomic_t default_mirror; /* Default mirror */
151 152
@@ -372,6 +373,16 @@ static void complete_resync_work(struct region *reg, int success)
372 struct region_hash *rh = reg->rh; 373 struct region_hash *rh = reg->rh;
373 374
374 rh->log->type->set_region_sync(rh->log, reg->key, success); 375 rh->log->type->set_region_sync(rh->log, reg->key, success);
376
377 /*
378 * Dispatch the bios before we call 'wake_up_all'.
379 * This is important because if we are suspending,
380 * we want to know that recovery is complete and
381 * the work queue is flushed. If we wake_up_all
382 * before we dispatch_bios (queue bios and call wake()),
383 * then we risk suspending before the work queue
384 * has been properly flushed.
385 */
375 dispatch_bios(rh->ms, &reg->delayed_bios); 386 dispatch_bios(rh->ms, &reg->delayed_bios);
376 if (atomic_dec_and_test(&rh->recovery_in_flight)) 387 if (atomic_dec_and_test(&rh->recovery_in_flight))
377 wake_up_all(&_kmirrord_recovery_stopped); 388 wake_up_all(&_kmirrord_recovery_stopped);
@@ -1069,11 +1080,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
1069 /* 1080 /*
1070 * Dispatch io. 1081 * Dispatch io.
1071 */ 1082 */
1072 if (unlikely(ms->log_failure)) 1083 if (unlikely(ms->log_failure)) {
1084 spin_lock_irq(&ms->lock);
1085 bio_list_merge(&ms->failures, &sync);
1086 spin_unlock_irq(&ms->lock);
1087 } else
1073 while ((bio = bio_list_pop(&sync))) 1088 while ((bio = bio_list_pop(&sync)))
1074 bio_endio(bio, -EIO); 1089 do_write(ms, bio);
1075 else while ((bio = bio_list_pop(&sync)))
1076 do_write(ms, bio);
1077 1090
1078 while ((bio = bio_list_pop(&recover))) 1091 while ((bio = bio_list_pop(&recover)))
1079 rh_delay(&ms->rh, bio); 1092 rh_delay(&ms->rh, bio);
@@ -1091,8 +1104,46 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
1091 if (!failures->head) 1104 if (!failures->head)
1092 return; 1105 return;
1093 1106
1094 while ((bio = bio_list_pop(failures))) 1107 if (!ms->log_failure) {
1095 __bio_mark_nosync(ms, bio, bio->bi_size, 0); 1108 while ((bio = bio_list_pop(failures)))
1109 __bio_mark_nosync(ms, bio, bio->bi_size, 0);
1110 return;
1111 }
1112
1113 /*
1114 * If the log has failed, unattempted writes are being
1115 * put on the failures list. We can't issue those writes
1116 * until a log has been marked, so we must store them.
1117 *
1118 * If a 'noflush' suspend is in progress, we can requeue
1119 * the I/O's to the core. This give userspace a chance
1120 * to reconfigure the mirror, at which point the core
1121 * will reissue the writes. If the 'noflush' flag is
1122 * not set, we have no choice but to return errors.
1123 *
1124 * Some writes on the failures list may have been
1125 * submitted before the log failure and represent a
1126 * failure to write to one of the devices. It is ok
1127 * for us to treat them the same and requeue them
1128 * as well.
1129 */
1130 if (dm_noflush_suspending(ms->ti)) {
1131 while ((bio = bio_list_pop(failures)))
1132 bio_endio(bio, DM_ENDIO_REQUEUE);
1133 return;
1134 }
1135
1136 if (atomic_read(&ms->suspend)) {
1137 while ((bio = bio_list_pop(failures)))
1138 bio_endio(bio, -EIO);
1139 return;
1140 }
1141
1142 spin_lock_irq(&ms->lock);
1143 bio_list_merge(&ms->failures, failures);
1144 spin_unlock_irq(&ms->lock);
1145
1146 wake(ms);
1096} 1147}
1097 1148
1098static void trigger_event(struct work_struct *work) 1149static void trigger_event(struct work_struct *work)
@@ -1176,6 +1227,8 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
1176 ms->nr_mirrors = nr_mirrors; 1227 ms->nr_mirrors = nr_mirrors;
1177 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 1228 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
1178 ms->in_sync = 0; 1229 ms->in_sync = 0;
1230 ms->log_failure = 0;
1231 atomic_set(&ms->suspend, 0);
1179 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 1232 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
1180 1233
1181 ms->io_client = dm_io_client_create(DM_IO_PAGES); 1234 ms->io_client = dm_io_client_create(DM_IO_PAGES);
@@ -1511,26 +1564,51 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1511 return 0; 1564 return 0;
1512} 1565}
1513 1566
1514static void mirror_postsuspend(struct dm_target *ti) 1567static void mirror_presuspend(struct dm_target *ti)
1515{ 1568{
1516 struct mirror_set *ms = (struct mirror_set *) ti->private; 1569 struct mirror_set *ms = (struct mirror_set *) ti->private;
1517 struct dirty_log *log = ms->rh.log; 1570 struct dirty_log *log = ms->rh.log;
1518 1571
1572 atomic_set(&ms->suspend, 1);
1573
1574 /*
1575 * We must finish up all the work that we've
1576 * generated (i.e. recovery work).
1577 */
1519 rh_stop_recovery(&ms->rh); 1578 rh_stop_recovery(&ms->rh);
1520 1579
1521 /* Wait for all I/O we generated to complete */
1522 wait_event(_kmirrord_recovery_stopped, 1580 wait_event(_kmirrord_recovery_stopped,
1523 !atomic_read(&ms->rh.recovery_in_flight)); 1581 !atomic_read(&ms->rh.recovery_in_flight));
1524 1582
1583 if (log->type->presuspend && log->type->presuspend(log))
1584 /* FIXME: need better error handling */
1585 DMWARN("log presuspend failed");
1586
1587 /*
1588 * Now that recovery is complete/stopped and the
1589 * delayed bios are queued, we need to wait for
1590 * the worker thread to complete. This way,
1591 * we know that all of our I/O has been pushed.
1592 */
1593 flush_workqueue(ms->kmirrord_wq);
1594}
1595
1596static void mirror_postsuspend(struct dm_target *ti)
1597{
1598 struct mirror_set *ms = ti->private;
1599 struct dirty_log *log = ms->rh.log;
1600
1525 if (log->type->postsuspend && log->type->postsuspend(log)) 1601 if (log->type->postsuspend && log->type->postsuspend(log))
1526 /* FIXME: need better error handling */ 1602 /* FIXME: need better error handling */
1527 DMWARN("log suspend failed"); 1603 DMWARN("log postsuspend failed");
1528} 1604}
1529 1605
1530static void mirror_resume(struct dm_target *ti) 1606static void mirror_resume(struct dm_target *ti)
1531{ 1607{
1532 struct mirror_set *ms = (struct mirror_set *) ti->private; 1608 struct mirror_set *ms = ti->private;
1533 struct dirty_log *log = ms->rh.log; 1609 struct dirty_log *log = ms->rh.log;
1610
1611 atomic_set(&ms->suspend, 0);
1534 if (log->type->resume && log->type->resume(log)) 1612 if (log->type->resume && log->type->resume(log))
1535 /* FIXME: need better error handling */ 1613 /* FIXME: need better error handling */
1536 DMWARN("log resume failed"); 1614 DMWARN("log resume failed");
@@ -1564,7 +1642,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1564 DMEMIT("%d", ms->nr_mirrors); 1642 DMEMIT("%d", ms->nr_mirrors);
1565 for (m = 0; m < ms->nr_mirrors; m++) 1643 for (m = 0; m < ms->nr_mirrors; m++)
1566 DMEMIT(" %s %llu", ms->mirror[m].dev->name, 1644 DMEMIT(" %s %llu", ms->mirror[m].dev->name,
1567 (unsigned long long)ms->mirror[m].offset); 1645 (unsigned long long)ms->mirror[m].offset);
1568 1646
1569 if (ms->features & DM_RAID1_HANDLE_ERRORS) 1647 if (ms->features & DM_RAID1_HANDLE_ERRORS)
1570 DMEMIT(" 1 handle_errors"); 1648 DMEMIT(" 1 handle_errors");
@@ -1581,6 +1659,7 @@ static struct target_type mirror_target = {
1581 .dtr = mirror_dtr, 1659 .dtr = mirror_dtr,
1582 .map = mirror_map, 1660 .map = mirror_map,
1583 .end_io = mirror_end_io, 1661 .end_io = mirror_end_io,
1662 .presuspend = mirror_presuspend,
1584 .postsuspend = mirror_postsuspend, 1663 .postsuspend = mirror_postsuspend,
1585 .resume = mirror_resume, 1664 .resume = mirror_resume,
1586 .status = mirror_status, 1665 .status = mirror_status,