Diffstat (limited to 'drivers/md/dm-raid1.c')
-rw-r--r--	drivers/md/dm-raid1.c	219
1 files changed, 156 insertions, 63 deletions
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..ad779bd13aec 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
  *---------------------------------------------------------------*/
 enum dm_raid1_error {
 	DM_RAID1_WRITE_ERROR,
+	DM_RAID1_FLUSH_ERROR,
 	DM_RAID1_SYNC_ERROR,
 	DM_RAID1_READ_ERROR
 };
@@ -57,6 +58,7 @@ struct mirror_set {
 	struct bio_list reads;
 	struct bio_list writes;
 	struct bio_list failures;
+	struct bio_list holds;	/* bios are waiting until suspend */
 
 	struct dm_region_hash *rh;
 	struct dm_kcopyd_client *kcopyd_client;
@@ -67,6 +69,7 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 	int log_failure;
+	int leg_failure;
 	atomic_t suspend;
 
 	atomic_t default_mirror;	/* Default mirror */
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
 	atomic_set(&ms->default_mirror, m - m0);
 }
 
+static struct mirror *get_valid_mirror(struct mirror_set *ms)
+{
+	struct mirror *m;
+
+	for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
+		if (!atomic_read(&m->error_count))
+			return m;
+
+	return NULL;
+}
+
 /* fail_mirror
  * @m: mirror device to fail
  * @error_type: one of the enum's, DM_RAID1_*_ERROR
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
+	ms->leg_failure = 1;
+
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 		goto out;
 	}
 
-	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
-		if (!atomic_read(&new->error_count)) {
-			set_default_mirror(new);
-			break;
-		}
-
-	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+	new = get_valid_mirror(ms);
+	if (new)
+		set_default_mirror(new);
+	else
 		DMWARN("All sides of mirror have failed.");
 
 out:
 	schedule_work(&ms->trigger_event);
 }
 
+static int mirror_flush(struct dm_target *ti)
+{
+	struct mirror_set *ms = ti->private;
+	unsigned long error_bits;
+
+	unsigned int i;
+	struct dm_io_region io[ms->nr_mirrors];
+	struct mirror *m;
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE_BARRIER,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.bvec = NULL,
+		.client = ms->io_client,
+	};
+
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
+		io[i].bdev = m->dev->bdev;
+		io[i].sector = 0;
+		io[i].count = 0;
+	}
+
+	error_bits = -1;
+	dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
+	if (unlikely(error_bits != 0)) {
+		for (i = 0; i < ms->nr_mirrors; i++)
+			if (test_bit(i, &error_bits))
+				fail_mirror(ms->mirror + i,
+					    DM_RAID1_FLUSH_ERROR);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
 *
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
  */
 static sector_t map_sector(struct mirror *m, struct bio *bio)
 {
+	if (unlikely(!bio->bi_size))
+		return 0;
 	return m->offset + (bio->bi_sector - m->ms->ti->begin);
 }
 
@@ -413,6 +462,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 	io->count = bio->bi_size >> 9;
 }
 
+static void hold_bio(struct mirror_set *ms, struct bio *bio)
+{
+	/*
+	 * If device is suspended, complete the bio.
+	 */
+	if (atomic_read(&ms->suspend)) {
+		if (dm_noflush_suspending(ms->ti))
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		else
+			bio_endio(bio, -EIO);
+		return;
+	}
+
+	/*
+	 * Hold bio until the suspend is complete.
+	 */
+	spin_lock_irq(&ms->lock);
+	bio_list_add(&ms->holds, bio);
+	spin_unlock_irq(&ms->lock);
+}
+
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
@@ -511,7 +581,6 @@ static void write_callback(unsigned long error, void *context)
 	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-	int uptodate = 0;
 	int should_wake = 0;
 	unsigned long flags;
 
@@ -524,36 +593,27 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
-	if (likely(!error))
-		goto out;
+	if (likely(!error)) {
+		bio_endio(bio, ret);
+		return;
+	}
 
 	for (i = 0; i < ms->nr_mirrors; i++)
 		if (test_bit(i, &error))
 			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-		else
-			uptodate = 1;
 
-	if (unlikely(!uptodate)) {
-		DMERR("All replicated volumes dead, failing I/O");
-		/* None of the writes succeeded, fail the I/O. */
-		ret = -EIO;
-	} else if (errors_handled(ms)) {
-		/*
-		 * Need to raise event. Since raising
-		 * events can block, we need to do it in
-		 * the main thread.
-		 */
-		spin_lock_irqsave(&ms->lock, flags);
-		if (!ms->failures.head)
-			should_wake = 1;
-		bio_list_add(&ms->failures, bio);
-		spin_unlock_irqrestore(&ms->lock, flags);
-		if (should_wake)
-			wakeup_mirrord(ms);
-		return;
-	}
-out:
-	bio_endio(bio, ret);
+	/*
+	 * Need to raise event. Since raising
+	 * events can block, we need to do it in
+	 * the main thread.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	if (!ms->failures.head)
+		should_wake = 1;
+	bio_list_add(&ms->failures, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
+	if (should_wake)
+		wakeup_mirrord(ms);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -562,7 +622,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -603,6 +663,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
+		if (unlikely(bio_empty_barrier(bio))) {
+			bio_list_add(&sync, bio);
+			continue;
+		}
+
 		region = dm_rh_bio_to_region(ms->rh, bio);
 
 		if (log->type->is_remote_recovering &&
@@ -672,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(get_default_mirror(ms), bio);
-		generic_make_request(bio);
+		if (unlikely(ms->leg_failure) && errors_handled(ms))
+			hold_bio(ms, bio);
+		else {
+			map_bio(get_default_mirror(ms), bio);
+			generic_make_request(bio);
+		}
 	}
 }
 
@@ -681,20 +750,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 {
 	struct bio *bio;
 
-	if (!failures->head)
-		return;
-
-	if (!ms->log_failure) {
-		while ((bio = bio_list_pop(failures))) {
-			ms->in_sync = 0;
-			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
-		}
+	if (likely(!failures->head))
 		return;
-	}
 
 	/*
 	 * If the log has failed, unattempted writes are being
-	 * put on the failures list. We can't issue those writes
+	 * put on the holds list. We can't issue those writes
 	 * until a log has been marked, so we must store them.
 	 *
 	 * If a 'noflush' suspend is in progress, we can requeue
@@ -709,23 +770,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	 * for us to treat them the same and requeue them
 	 * as well.
 	 */
-	if (dm_noflush_suspending(ms->ti)) {
-		while ((bio = bio_list_pop(failures)))
-			bio_endio(bio, DM_ENDIO_REQUEUE);
-		return;
-	}
+	while ((bio = bio_list_pop(failures))) {
+		if (!ms->log_failure) {
+			ms->in_sync = 0;
+			dm_rh_mark_nosync(ms->rh, bio);
+		}
 
-	if (atomic_read(&ms->suspend)) {
-		while ((bio = bio_list_pop(failures)))
+		/*
+		 * If all the legs are dead, fail the I/O.
+		 * If we have been told to handle errors, hold the bio
+		 * and wait for userspace to deal with the problem.
+		 * Otherwise pretend that the I/O succeeded. (This would
+		 * be wrong if the failed leg returned after reboot and
+		 * got replicated back to the good legs.)
+		 */
+		if (!get_valid_mirror(ms))
 			bio_endio(bio, -EIO);
-		return;
+		else if (errors_handled(ms))
+			hold_bio(ms, bio);
+		else
+			bio_endio(bio, 0);
 	}
-
-	spin_lock_irq(&ms->lock);
-	bio_list_merge(&ms->failures, failures);
-	spin_unlock_irq(&ms->lock);
-
-	delayed_wake(ms);
 }
 
 static void trigger_event(struct work_struct *work)
@@ -784,12 +849,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	}
 
 	spin_lock_init(&ms->lock);
+	bio_list_init(&ms->reads);
+	bio_list_init(&ms->writes);
+	bio_list_init(&ms->failures);
+	bio_list_init(&ms->holds);
 
 	ms->ti = ti;
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
 	ms->log_failure = 0;
+	ms->leg_failure = 0;
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
@@ -889,7 +959,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
+	dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
+				 argv + 2);
 	if (!dl) {
 		ti->error = "Error creating mirror dirty log";
 		return NULL;
@@ -995,6 +1066,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = ms;
 	ti->split_io = dm_rh_get_region_size(ms->rh);
+	ti->num_flush_requests = 1;
 
 	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
 	if (!ms->kmirrord_wq) {
@@ -1122,7 +1194,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		dm_rh_dec(ms->rh, map_context->ll);
+		if (likely(!bio_empty_barrier(bio)))
+			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
 
@@ -1180,6 +1253,9 @@ static void mirror_presuspend(struct dm_target *ti)
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
+	struct bio_list holds;
+	struct bio *bio;
+
 	atomic_set(&ms->suspend, 1);
 
 	/*
@@ -1202,6 +1278,22 @@ static void mirror_presuspend(struct dm_target *ti)
 	 * we know that all of our I/O has been pushed.
 	 */
 	flush_workqueue(ms->kmirrord_wq);
+
+	/*
+	 * Now set ms->suspend is set and the workqueue flushed, no more
+	 * entries can be added to ms->hold list, so process it.
+	 *
+	 * Bios can still arrive concurrently with or after this
+	 * presuspend function, but they cannot join the hold list
+	 * because ms->suspend is set.
+	 */
+	spin_lock_irq(&ms->lock);
+	holds = ms->holds;
+	bio_list_init(&ms->holds);
+	spin_unlock_irq(&ms->lock);
+
+	while ((bio = bio_list_pop(&holds)))
+		hold_bio(ms, bio);
 }
 
 static void mirror_postsuspend(struct dm_target *ti)
@@ -1244,7 +1336,8 @@ static char device_status_char(struct mirror *m)
 	if (!atomic_read(&(m->error_count)))
 		return 'A';
 
-	return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
+	return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
+		(test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
 		(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
 		(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
 }
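
For readers following the hold/release logic this patch introduces (hold_bio(), the new ms->holds list, and the drain added to mirror_presuspend()), the stand-alone C sketch below models that flow outside the kernel. It is not kernel code and not part of the patch: struct held_io stands in for struct bio, a status field stands in for bio_endio(), and every identifier here (mirror_model, hold_io, presuspend) is invented purely for illustration, under the simplified semantics visible in the hunks above.

/*
 * Toy model of the dm-raid1 "holds" mechanism: park I/O while a leg
 * failure is being handled, then complete it at suspend time.
 * All names are illustrative; this is not the kernel implementation.
 */
#include <stdio.h>

enum io_status { IO_PENDING, IO_REQUEUED, IO_FAILED };

struct held_io {
	int id;
	enum io_status status;
	struct held_io *next;
};

struct mirror_model {
	int suspended;		/* models atomic_t suspend */
	int noflush;		/* models dm_noflush_suspending() */
	struct held_io *holds;	/* models ms->holds */
};

/* Models hold_bio(): complete immediately if suspended, else park it. */
static void hold_io(struct mirror_model *ms, struct held_io *io)
{
	if (ms->suspended) {
		io->status = ms->noflush ? IO_REQUEUED : IO_FAILED;
		return;
	}
	io->next = ms->holds;
	ms->holds = io;
}

/* Models the new tail of mirror_presuspend(): drain and re-dispatch. */
static void presuspend(struct mirror_model *ms)
{
	struct held_io *io, *next;

	ms->suspended = 1;
	io = ms->holds;
	ms->holds = NULL;
	for (; io; io = next) {
		next = io->next;
		hold_io(ms, io);	/* now completes instead of parking */
	}
}

int main(void)
{
	struct mirror_model ms = { .noflush = 1 };
	struct held_io a = { .id = 1, .status = IO_PENDING };
	struct held_io b = { .id = 2, .status = IO_PENDING };

	hold_io(&ms, &a);	/* leg failed, errors handled: park the I/O */
	hold_io(&ms, &b);
	presuspend(&ms);	/* userspace suspends to repair the mirror */

	printf("io %d -> %s\n", a.id, a.status == IO_REQUEUED ? "requeued" : "failed");
	printf("io %d -> %s\n", b.id, b.status == IO_REQUEUED ? "requeued" : "failed");
	return 0;
}

The point of the model is the second call to hold_io() during presuspend: once the suspend flag is set a bio can no longer be parked, so the same routine completes it, either requeueing it (noflush suspend) or failing it, which is the branch the real hold_bio() takes in the patch.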