Merge branch 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block: (46 commits)
  xen-blkfront: disable barrier/flush write support
  Added blk-lib.c and blk-barrier.c was renamed to blk-flush.c
  block: remove BLKDEV_IFL_WAIT
  aic7xxx_old: removed unused 'req' variable
  block: remove the BH_Eopnotsupp flag
  block: remove the BLKDEV_IFL_BARRIER flag
  block: remove the WRITE_BARRIER flag
  swap: do not send discards as barriers
  fat: do not send discards as barriers
  ext4: do not send discards as barriers
  jbd2: replace barriers with explicit flush / FUA usage
  jbd2: Modify ASYNC_COMMIT code to not rely on queue draining on barrier
  jbd: replace barriers with explicit flush / FUA usage
  nilfs2: replace barriers with explicit flush / FUA usage
  reiserfs: replace barriers with explicit flush / FUA usage
  gfs2: replace barriers with explicit flush / FUA usage
  btrfs: replace barriers with explicit flush / FUA usage
  xfs: replace barriers with explicit flush / FUA usage
  block: pass gfp_mask and flags to sb_issue_discard
  dm: convey that all flushes are processed as empty
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-22 20:07:18 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-22 20:07:18 -0400
commit: a2887097f25cd38cadfc11d10769e2b349fb5eca (patch)
tree: cd4adcb305365d6ba9acd2c02d4eb9d0125c6f8d /drivers/md
parent: 8abfc6e7a45eb74e51904bbae676fae008b11366 (diff)
parent: 005a1d15f5a6b2bb4ada80349513effbf22b4588 (diff)
19 files changed, 226 insertions, 611 deletions
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e98f705..d5b0e4c0e702 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1278,7 +1278,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
        struct dm_crypt_io *io;
        struct crypt_config *cc;
-        if (unlikely(bio_empty_barrier(bio))) {
+        if (bio->bi_rw & REQ_FLUSH) {
                cc = ti->private;
                bio->bi_bdev = cc->dev->bdev;
                return DM_MAPIO_REMAPPED;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0590c75b0ab6..136d4f71a116 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -31,7 +31,6 @@ struct dm_io_client {
 */
 struct io {
        unsigned long error_bits;
-        unsigned long eopnotsupp_bits;
        atomic_t count;
        struct task_struct *sleeper;
        struct dm_io_client *client;
@@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
 *---------------------------------------------------------------*/
 static void dec_count(struct io *io, unsigned int region, int error)
 {
-        if (error) {
+        if (error)
                set_bit(region, &io->error_bits);
-                if (error == -EOPNOTSUPP)
-                        set_bit(region, &io->eopnotsupp_bits);
-        }
        if (atomic_dec_and_test(&io->count)) {
                if (io->sleeper)
@@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
        sector_t remaining = where->count;
        /*
-         * where->count may be zero if rw holds a write barrier and we
+         * where->count may be zero if rw holds a flush and we need to
-         * need to send a zero-sized barrier.
+         * send a zero-sized flush.
         */
        do {
                /*
@@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
         */
        for (i = 0; i < num_regions; i++) {
                *dp = old_pages;
-                if (where[i].count || (rw & REQ_HARDBARRIER))
+                if (where[i].count || (rw & REQ_FLUSH))
                        do_region(rw, i, where + i, dp, io);
        }
@@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
                return -EIO;
        }
-retry:
        io->error_bits = 0;
-        io->eopnotsupp_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
        io->sleeper = current;
        io->client = client;
@@ -412,11 +406,6 @@ retry:
        }
        set_current_state(TASK_RUNNING);
-        if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
-                rw &= ~REQ_HARDBARRIER;
-                goto retry;
-        }
        if (error_bits)
                *error_bits = io->error_bits;
@@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
        io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
-        io->eopnotsupp_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
        io->sleeper = NULL;
        io->client = client;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5a08be0222db..33420e68d153 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc)
                .count = 0,
        };
-        lc->io_req.bi_rw = WRITE_BARRIER;
+        lc->io_req.bi_rw = WRITE_FLUSH;
        return dm_io(&lc->io_req, 1, &null_location, NULL);
 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7c081bcbc3cf..19a59b041c27 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti)
        struct dm_io_region io[ms->nr_mirrors];
        struct mirror *m;
        struct dm_io_request io_req = {
-                .bi_rw = WRITE_BARRIER,
+                .bi_rw = WRITE_FLUSH,
                .mem.type = DM_IO_KMEM,
                .mem.ptr.bvec = NULL,
                .client = ms->io_client,
@@ -629,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
        struct dm_io_region io[ms->nr_mirrors], *dest = io;
        struct mirror *m;
        struct dm_io_request io_req = {
-                .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
+                .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
                .mem.type = DM_IO_BVEC,
                .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
                .notify.fn = write_callback,
@@ -670,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
        bio_list_init(&requeue);
        while ((bio = bio_list_pop(writes))) {
-                if (unlikely(bio_empty_barrier(bio))) {
+                if (bio->bi_rw & REQ_FLUSH) {
                        bio_list_add(&sync, bio);
                        continue;
                }
@@ -1203,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
         * We need to dec pending if this was a write.
         */
        if (rw == WRITE) {
-                if (likely(!bio_empty_barrier(bio)))
+                if (!(bio->bi_rw & REQ_FLUSH))
                        dm_rh_dec(ms->rh, map_context->ll);
                return error;
        }
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index bd5c58b28868..dad011aed0c9 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -81,9 +81,9 @@ struct dm_region_hash {
        struct list_head failed_recovered_regions;
        /*
-         * If there was a barrier failure no regions can be marked clean.
+         * If there was a flush failure no regions can be marked clean.
         */
-        int barrier_failure;
+        int flush_failure;
        void *context;
        sector_t target_begin;
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
        INIT_LIST_HEAD(&rh->quiesced_regions);
        INIT_LIST_HEAD(&rh->recovered_regions);
        INIT_LIST_HEAD(&rh->failed_recovered_regions);
-        rh->barrier_failure = 0;
+        rh->flush_failure = 0;
        rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
                                                      sizeof(struct dm_region));
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
        region_t region = dm_rh_bio_to_region(rh, bio);
        int recovering = 0;
-        if (bio_empty_barrier(bio)) {
+        if (bio->bi_rw & REQ_FLUSH) {
-                rh->barrier_failure = 1;
+                rh->flush_failure = 1;
                return;
        }
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
        struct bio *bio;
        for (bio = bios->head; bio; bio = bio->bi_next) {
-                if (bio_empty_barrier(bio))
+                if (bio->bi_rw & REQ_FLUSH)
                        continue;
                rh_inc(rh, dm_rh_bio_to_region(rh, bio));
        }
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
                 */
                /* do nothing for DM_RH_NOSYNC */
-                if (unlikely(rh->barrier_failure)) {
+                if (unlikely(rh->flush_failure)) {
                        /*
-                         * If a write barrier failed some time ago, we
+                         * If a write flush failed some time ago, we
                         * don't know whether or not this write made it
                         * to the disk, so we must resync the device.
                         */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index cc2bdb83f9ad..0b61792a2780 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
        /*
         * Commit exceptions to disk.
         */
-        if (ps->valid && area_io(ps, WRITE_BARRIER))
+        if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
                ps->valid = 0;
        /*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f30f6e8d594e..53cf79d8bcbc 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1585,7 +1585,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
        chunk_t chunk;
        struct dm_snap_pending_exception *pe = NULL;
-        if (unlikely(bio_empty_barrier(bio))) {
+        if (bio->bi_rw & REQ_FLUSH) {
                bio->bi_bdev = s->cow->bdev;
                return DM_MAPIO_REMAPPED;
        }
@@ -1689,7 +1689,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
        int r = DM_MAPIO_REMAPPED;
        chunk_t chunk;
-        if (unlikely(bio_empty_barrier(bio))) {
+        if (bio->bi_rw & REQ_FLUSH) {
                if (!map_context->target_request_nr)
                        bio->bi_bdev = s->origin->bdev;
                else
@@ -2133,7 +2133,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
        struct dm_dev *dev = ti->private;
        bio->bi_bdev = dev->bdev;
-        if (unlikely(bio_empty_barrier(bio)))
+        if (bio->bi_rw & REQ_FLUSH)
                return DM_MAPIO_REMAPPED;
        /* Only tell snapshots if this is a write */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c297f6da91ea..f0371b4c4fbf 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -271,7 +271,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
        uint32_t stripe;
        unsigned target_request_nr;
-        if (unlikely(bio_empty_barrier(bio))) {
+        if (bio->bi_rw & REQ_FLUSH) {
                target_request_nr = map_context->target_request_nr;
                BUG_ON(target_request_nr >= sc->stripes);
                bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7967eca5a2d5..7cb1352f7e7a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_QUEUE_IO_TO_THREAD 6
 /*
 * Work processed by per-device workqueue.
@@ -144,24 +143,9 @@ struct mapped_device {
        spinlock_t deferred_lock;
        /*
-         * An error from the barrier request currently being processed.
+         * Processing queue (flush)
-         */
-        int barrier_error;
-        /*
-         * Protect barrier_error from concurrent endio processing
-         * in request-based dm.
-         */
-        spinlock_t barrier_error_lock;
-        /*
-         * Processing queue (flush/barriers)
         */
        struct workqueue_struct *wq;
-        struct work_struct barrier_work;
-        /* A pointer to the currently processing pre/post flush request */
-        struct request *flush_request;
        /*
         * The current mapping.
@@ -200,8 +184,8 @@ struct mapped_device {
        /* sysfs handle */
        struct kobject kobj;
-        /* zero-length barrier that will be cloned and submitted to targets */
+        /* zero-length flush that will be cloned and submitted to targets */
-        struct bio barrier_bio;
+        struct bio flush_bio;
 };
 /*
@@ -512,7 +496,7 @@ static void end_io_acct(struct dm_io *io)
        /*
         * After this is decremented the bio must not be touched if it is
-         * a barrier.
+         * a flush.
         */
        dm_disk(md)->part0.in_flight[rw] = pending =
                atomic_dec_return(&md->pending[rw]);
@@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io)
 */
 static void queue_io(struct mapped_device *md, struct bio *bio)
 {
-        down_write(&md->io_lock);
+        unsigned long flags;
-        spin_lock_irq(&md->deferred_lock);
+        spin_lock_irqsave(&md->deferred_lock, flags);
        bio_list_add(&md->deferred, bio);
-        spin_unlock_irq(&md->deferred_lock);
+        spin_unlock_irqrestore(&md->deferred_lock, flags);
+        queue_work(md->wq, &md->work);
-        if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
-                queue_work(md->wq, &md->work);
-        up_write(&md->io_lock);
 }
 /*
@@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error)
                         * Target requested pushing back the I/O.
                         */
                        spin_lock_irqsave(&md->deferred_lock, flags);
-                        if (__noflush_suspending(md)) {
+                        if (__noflush_suspending(md))
-                                if (!(io->bio->bi_rw & REQ_HARDBARRIER))
+                                bio_list_add_head(&md->deferred, io->bio);
-                                        bio_list_add_head(&md->deferred,
+                        else
-                                                          io->bio);
-                        } else
                                /* noflush suspend was interrupted. */
                                io->error = -EIO;
                        spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error)
                io_error = io->error;
                bio = io->bio;
+                end_io_acct(io);
+                free_io(md, io);
+                if (io_error == DM_ENDIO_REQUEUE)
+                        return;
-                if (bio->bi_rw & REQ_HARDBARRIER) {
+                if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
                        /*
-                         * There can be just one barrier request so we use
+                         * Preflush done for flush with data, reissue
-                         * a per-device variable for error reporting.
+                         * without REQ_FLUSH.
-                         * Note that you can't touch the bio after end_io_acct
-                         *
-                         * We ignore -EOPNOTSUPP for empty flush reported by
-                         * underlying devices. We assume that if the device
-                         * doesn't support empty barriers, it doesn't need
-                         * cache flushing commands.
                         */
-                        if (!md->barrier_error &&
+                        bio->bi_rw &= ~REQ_FLUSH;
-                            !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
+                        queue_io(md, bio);
-                                md->barrier_error = io_error;
-                        end_io_acct(io);
-                        free_io(md, io);
                } else {
-                        end_io_acct(io);
+                        /* done with normal IO or empty flush */
-                        free_io(md, io);
+                        trace_block_bio_complete(md->queue, bio);
+                        bio_endio(bio, io_error);
-                        if (io_error != DM_ENDIO_REQUEUE) {
-                                trace_block_bio_complete(md->queue, bio);
-                                bio_endio(bio, io_error);
-                        }
                }
        }
 }
@@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error)
        blk_update_request(tio->orig, 0, nr_bytes);
 }
-static void store_barrier_error(struct mapped_device *md, int error)
-{
-        unsigned long flags;
-        spin_lock_irqsave(&md->barrier_error_lock, flags);
-        /*
-         * Basically, the first error is taken, but:
-         *   -EOPNOTSUPP supersedes any I/O error.
-         *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
-         */
-        if (!md->barrier_error || error == -EOPNOTSUPP ||
-            (md->barrier_error != -EOPNOTSUPP &&
-             error == DM_ENDIO_REQUEUE))
-                md->barrier_error = error;
-        spin_unlock_irqrestore(&md->barrier_error_lock, flags);
-}
 /*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
@@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone)
 static void dm_end_request(struct request *clone, int error)
 {
        int rw = rq_data_dir(clone);
-        int run_queue = 1;
-        bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct mapped_device *md = tio->md;
        struct request *rq = tio->orig;
-        if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+        if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
                rq->errors = clone->errors;
                rq->resid_len = clone->resid_len;
@@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error)
        }
        free_rq_clone(clone);
+        blk_end_request_all(rq, error);
-        if (unlikely(is_barrier)) {
+        rq_completed(md, rw, true);
-                if (unlikely(error))
-                        store_barrier_error(md, error);
-                run_queue = 0;
-        } else
-                blk_end_request_all(rq, error);
-        rq_completed(md, rw, run_queue);
 }
 static void dm_unprep_request(struct request *rq)
@@ -862,16 +805,6 @@ void dm_requeue_unmapped_request(struct request *clone)
        struct request_queue *q = rq->q;
        unsigned long flags;
-        if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-                /*
-                 * Barrier clones share an original request.
-                 * Leave it to dm_end_request(), which handles this special
-                 * case.
-                 */
-                dm_end_request(clone, DM_ENDIO_REQUEUE);
-                return;
-        }
        dm_unprep_request(rq);
        spin_lock_irqsave(q->queue_lock, flags);
@@ -961,19 +894,6 @@ static void dm_complete_request(struct request *clone, int error)
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
-        if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-                /*
-                 * Barrier clones share an original request.  So can't use
-                 * softirq_done with the original.
-                 * Pass the clone to dm_done() directly in this special case.
-                 * It is safe (even if clone->q->queue_lock is held here)
-                 * because there is no I/O dispatching during the completion
-                 * of barrier clone.
-                 */
-                dm_done(clone, error, true);
-                return;
-        }
        tio->error = error;
        rq->completion_data = clone;
        blk_complete_request(rq);
@@ -990,17 +910,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
-        if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-                /*
-                 * Barrier clones share an original request.
-                 * Leave it to dm_end_request(), which handles this special
-                 * case.
-                 */
-                BUG_ON(error > 0);
-                dm_end_request(clone, error);
-                return;
-        }
        rq->cmd_flags |= REQ_FAILED;
        dm_complete_request(clone, error);
 }
@@ -1119,7 +1028,7 @@ static void dm_bio_destructor(struct bio *bio)
 }
 /*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
 */
 static struct bio *split_bvec(struct bio *bio, sector_t sector,
                              unsigned short idx, unsigned int offset,
@@ -1134,7 +1043,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
        clone->bi_sector = sector;
        clone->bi_bdev = bio->bi_bdev;
-        clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+        clone->bi_rw = bio->bi_rw;
        clone->bi_vcnt = 1;
        clone->bi_size = to_bytes(len);
        clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1070,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
        clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
        __bio_clone(clone, bio);
-        clone->bi_rw &= ~REQ_HARDBARRIER;
        clone->bi_destructor = dm_bio_destructor;
        clone->bi_sector = sector;
        clone->bi_idx = idx;
@@ -1225,16 +1133,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
                __issue_target_request(ci, ti, request_nr, len);
 }
-static int __clone_and_map_empty_barrier(struct clone_info *ci)
+static int __clone_and_map_empty_flush(struct clone_info *ci)
 {
        unsigned target_nr = 0;
        struct dm_target *ti;
+        BUG_ON(bio_has_data(ci->bio));
        while ((ti = dm_table_get_target(ci->map, target_nr++)))
                __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
-        ci->sector_count = 0;
        return 0;
 }
@@ -1289,9 +1196,6 @@ static int __clone_and_map(struct clone_info *ci)
        sector_t len = 0, max;
        struct dm_target_io *tio;
-        if (unlikely(bio_empty_barrier(bio)))
-                return __clone_and_map_empty_barrier(ci);
        if (unlikely(bio->bi_rw & REQ_DISCARD))
                return __clone_and_map_discard(ci);
@@ -1383,16 +1287,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
        ci.map = dm_get_live_table(md);
        if (unlikely(!ci.map)) {
-                if (!(bio->bi_rw & REQ_HARDBARRIER))
+                bio_io_error(bio);
-                        bio_io_error(bio);
-                else
-                        if (!md->barrier_error)
-                                md->barrier_error = -EIO;
                return;
        }
        ci.md = md;
-        ci.bio = bio;
        ci.io = alloc_io(md);
        ci.io->error = 0;
        atomic_set(&ci.io->io_count, 1);
@@ -1400,14 +1299,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
        ci.io->md = md;
        spin_lock_init(&ci.io->endio_lock);
        ci.sector = bio->bi_sector;
-        ci.sector_count = bio_sectors(bio);
-        if (unlikely(bio_empty_barrier(bio)))
-                ci.sector_count = 1;
        ci.idx = bio->bi_idx;
        start_io_acct(ci.io);
-        while (ci.sector_count && !error)
+        if (bio->bi_rw & REQ_FLUSH) {
-                error = __clone_and_map(&ci);
+                ci.bio = &ci.md->flush_bio;
+                ci.sector_count = 0;
+                error = __clone_and_map_empty_flush(&ci);
+                /* dec_pending submits any data associated with flush */
+        } else {
+                ci.bio = bio;
+                ci.sector_count = bio_sectors(bio);
+                while (ci.sector_count && !error)
+                        error = __clone_and_map(&ci);
+        }
        /* drop the extra reference count */
        dec_pending(ci.io, error);
@@ -1491,22 +1396,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
        part_stat_unlock();
-        /*
+        /* if we're suspended, we have to queue this io for later */
-         * If we're suspended or the thread is processing barriers
+        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
-         * we have to queue this io for later.
-         */
-        if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-            unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
                up_read(&md->io_lock);
-                if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
+                if (bio_rw(bio) != READA)
-                    bio_rw(bio) == READA) {
+                        queue_io(md, bio);
+                else
                        bio_io_error(bio);
-                        return 0;
-                }
-                queue_io(md, bio);
                return 0;
        }
@@ -1537,14 +1434,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
        return _dm_request(q, bio);
 }
-static bool dm_rq_is_flush_request(struct request *rq)
-{
-        if (rq->cmd_flags & REQ_FLUSH)
-                return true;
-        else
-                return false;
-}
 void dm_dispatch_request(struct request *rq)
 {
        int r;
@@ -1592,22 +1481,15 @@ static int setup_clone(struct request *clone, struct request *rq,
 {
        int r;
-        if (dm_rq_is_flush_request(rq)) {
+        r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-                blk_rq_init(NULL, clone);
+                              dm_rq_bio_constructor, tio);
-                clone->cmd_type = REQ_TYPE_FS;
+        if (r)
-                clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
+                return r;
-        } else {
-                r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-                                      dm_rq_bio_constructor, tio);
-                if (r)
-                        return r;
-                clone->cmd = rq->cmd;
-                clone->cmd_len = rq->cmd_len;
-                clone->sense = rq->sense;
-                clone->buffer = rq->buffer;
-        }
+        clone->cmd = rq->cmd;
+        clone->cmd_len = rq->cmd_len;
+        clone->sense = rq->sense;
+        clone->buffer = rq->buffer;
        clone->end_io = end_clone_request;
        clone->end_io_data = tio;
@@ -1648,9 +1530,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
        struct mapped_device *md = q->queuedata;
        struct request *clone;
-        if (unlikely(dm_rq_is_flush_request(rq)))
-                return BLKPREP_OK;
        if (unlikely(rq->special)) {
                DMWARN("Already has something in rq->special.");
                return BLKPREP_KILL;
@@ -1727,6 +1606,7 @@ static void dm_request_fn(struct request_queue *q)
        struct dm_table *map = dm_get_live_table(md);
        struct dm_target *ti;
        struct request *rq, *clone;
+        sector_t pos;
        /*
         * For suspend, check blk_queue_stopped() and increment
@@ -1739,15 +1619,14 @@ static void dm_request_fn(struct request_queue *q)
                if (!rq)
                        goto plug_and_out;
-                if (unlikely(dm_rq_is_flush_request(rq))) {
+                /* always use block 0 to find the target for flushes for now */
-                        BUG_ON(md->flush_request);
+                pos = 0;
-                        md->flush_request = rq;
+                if (!(rq->cmd_flags & REQ_FLUSH))
-                        blk_start_request(rq);
+                        pos = blk_rq_pos(rq);
-                        queue_work(md->wq, &md->barrier_work);
-                        goto out;
+                ti = dm_table_find_target(map, pos);
-                }
+                BUG_ON(!dm_target_is_valid(ti));
-                ti = dm_table_find_target(map, blk_rq_pos(rq));
                if (ti->type->busy && ti->type->busy(ti))
                        goto plug_and_out;
@@ -1918,7 +1797,6 @@ out:
 static const struct block_device_operations dm_blk_dops;
 static void dm_wq_work(struct work_struct *work);
-static void dm_rq_barrier_work(struct work_struct *work);
 static void dm_init_md_queue(struct mapped_device *md)
 {
@@ -1940,6 +1818,7 @@ static void dm_init_md_queue(struct mapped_device *md)
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        md->queue->unplug_fn = dm_unplug_all;
        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+        blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 /*
@@ -1972,7 +1851,6 @@ static struct mapped_device *alloc_dev(int minor)
        mutex_init(&md->suspend_lock);
        mutex_init(&md->type_lock);
        spin_lock_init(&md->deferred_lock);
-        spin_lock_init(&md->barrier_error_lock);
        rwlock_init(&md->map_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
@@ -1995,7 +1873,6 @@ static struct mapped_device *alloc_dev(int minor)
        atomic_set(&md->pending[1], 0);
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
-        INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
        init_waitqueue_head(&md->eventq);
        md->disk->major = _major;
@@ -2015,6 +1892,10 @@ static struct mapped_device *alloc_dev(int minor)
        if (!md->bdev)
                goto bad_bdev;
+        bio_init(&md->flush_bio);
+        md->flush_bio.bi_bdev = md->bdev;
+        md->flush_bio.bi_rw = WRITE_FLUSH;
        /* Populate the mapping, nobody knows we exist yet */
        spin_lock(&_minor_lock);
        old_md = idr_replace(&_minor_idr, md, minor);
@@ -2245,7 +2126,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
        blk_queue_softirq_done(md->queue, dm_softirq_done);
        blk_queue_prep_rq(md->queue, dm_prep_fn);
        blk_queue_lld_busy(md->queue, dm_lld_busy);
-        blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
        elv_register_queue(md->queue);
@@ -2406,43 +2286,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
        return r;
 }
-static void dm_flush(struct mapped_device *md)
-{
-        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-        bio_init(&md->barrier_bio);
-        md->barrier_bio.bi_bdev = md->bdev;
-        md->barrier_bio.bi_rw = WRITE_BARRIER;
-        __split_and_process_bio(md, &md->barrier_bio);
-        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-}
-static void process_barrier(struct mapped_device *md, struct bio *bio)
-{
-        md->barrier_error = 0;
-        dm_flush(md);
-        if (!bio_empty_barrier(bio)) {
-                __split_and_process_bio(md, bio);
-                /*
-                 * If the request isn't supported, don't waste time with
-                 * the second flush.
-                 */
-                if (md->barrier_error != -EOPNOTSUPP)
-                        dm_flush(md);
-        }
-        if (md->barrier_error != DM_ENDIO_REQUEUE)
-                bio_endio(bio, md->barrier_error);
-        else {
-                spin_lock_irq(&md->deferred_lock);
-                bio_list_add_head(&md->deferred, bio);
-                spin_unlock_irq(&md->deferred_lock);
-        }
-}
 /*
 * Process the deferred bios
 */
@@ -2452,33 +2295,27 @@ static void dm_wq_work(struct work_struct *work)
                                                work);
        struct bio *c;
-        down_write(&md->io_lock);
+        down_read(&md->io_lock);
        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
                spin_lock_irq(&md->deferred_lock);
                c = bio_list_pop(&md->deferred);
                spin_unlock_irq(&md->deferred_lock);
-                if (!c) {
+                if (!c)
-                        clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
                        break;
-                }
-                up_write(&md->io_lock);
+                up_read(&md->io_lock);
                if (dm_request_based(md))
                        generic_make_request(c);
-                else {
+                else
-                        if (c->bi_rw & REQ_HARDBARRIER)
+                        __split_and_process_bio(md, c);
-                                process_barrier(md, c);
-                        else
-                                __split_and_process_bio(md, c);
-                }
-                down_write(&md->io_lock);
+                down_read(&md->io_lock);
        }
-        up_write(&md->io_lock);
+        up_read(&md->io_lock);
 }
 static void dm_queue_flush(struct mapped_device *md)
@@ -2488,73 +2325,6 @@ static void dm_queue_flush(struct mapped_device *md)
        queue_work(md->wq, &md->work);
 }
-static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
-{
-        struct dm_rq_target_io *tio = clone->end_io_data;
-        tio->info.target_request_nr = request_nr;
-}
-/* Issue barrier requests to targets and wait for their completion. */
-static int dm_rq_barrier(struct mapped_device *md)
-{
-        int i, j;
-        struct dm_table *map = dm_get_live_table(md);
-        unsigned num_targets = dm_table_get_num_targets(map);
-        struct dm_target *ti;
-        struct request *clone;
-        md->barrier_error = 0;
-        for (i = 0; i < num_targets; i++) {
-                ti = dm_table_get_target(map, i);
-                for (j = 0; j < ti->num_flush_requests; j++) {
-                        clone = clone_rq(md->flush_request, md, GFP_NOIO);
-                        dm_rq_set_target_request_nr(clone, j);
-                        atomic_inc(&md->pending[rq_data_dir(clone)]);
-                        map_request(ti, clone, md);
-                }
-        }
-        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-        dm_table_put(map);
-        return md->barrier_error;
-}
-static void dm_rq_barrier_work(struct work_struct *work)
-{
-        int error;
-        struct mapped_device *md = container_of(work, struct mapped_device,
-                                                barrier_work);
-        struct request_queue *q = md->queue;
-        struct request *rq;
-        unsigned long flags;
-        /*
-         * Hold the md reference here and leave it at the last part so that
-         * the md can't be deleted by device opener when the barrier request
-         * completes.
-         */
-        dm_get(md);
-        error = dm_rq_barrier(md);
-        rq = md->flush_request;
-        md->flush_request = NULL;
-        if (error == DM_ENDIO_REQUEUE) {
-                spin_lock_irqsave(q->queue_lock, flags);
-                blk_requeue_request(q, rq);
-                spin_unlock_irqrestore(q->queue_lock, flags);
-        } else
-                blk_end_request_all(rq, error);
-        blk_run_queue(q);
-        dm_put(md);
-}
 /*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
@@ -2677,23 +2447,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
         *
         * To get all processes out of __split_and_process_bio in dm_request,
         * we take the write lock. To prevent any process from reentering
-         * __split_and_process_bio from dm_request, we set
+         * __split_and_process_bio from dm_request and quiesce the thread
-         * DMF_QUEUE_IO_TO_THREAD.
+         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
-         *
+         * flush_workqueue(md->wq).
-         * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
-         * and call flush_workqueue(md->wq). flush_workqueue will wait until
-         * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
-         * further calls to __split_and_process_bio from dm_wq_work.
         */
        down_write(&md->io_lock);
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-        set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
        up_write(&md->io_lock);
        /*
-         * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
+         * Stop md->queue before flushing md->wq in case request-based
-         * can be kicked until md->queue is stopped.  So stop md->queue before
+         * dm defers requests to md->wq from md->queue.
-         * flushing md->wq.
         */
        if (dm_request_based(md))
                stop_queue(md->queue);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3f..8a2f767f26d8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
        dev_info_t *tmp_dev;
        sector_t start_sector;
-        if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+        if (unlikely(bio->bi_rw & REQ_FLUSH)) {
-                md_barrier_request(mddev, bio);
+                md_flush_request(mddev, bio);
                return 0;
        }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dbf822df942a..225815197a3d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -227,12 +227,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
                return 0;
        }
        rcu_read_lock();
-        if (mddev->suspended || mddev->barrier) {
+        if (mddev->suspended) {
                DEFINE_WAIT(__wait);
                for (;;) {
                        prepare_to_wait(&mddev->sb_wait, &__wait,
                                        TASK_UNINTERRUPTIBLE);
-                        if (!mddev->suspended && !mddev->barrier)
+                        if (!mddev->suspended)
                                break;
                        rcu_read_unlock();
                        schedule();
@@ -283,40 +283,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
 int mddev_congested(mddev_t *mddev, int bits)
 {
-        if (mddev->barrier)
-                return 1;
        return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
 */
-#define POST_REQUEST_BARRIER ((void*)1)
+static void md_end_flush(struct bio *bio, int err)
-static void md_end_barrier(struct bio *bio, int err)
 {
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;
-        if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-                set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
        rdev_dec_pending(rdev, mddev);
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-                if (mddev->barrier == POST_REQUEST_BARRIER) {
+                /* The pre-request flush has finished */
-                        /* This was a post-request barrier */
+                schedule_work(&mddev->flush_work);
-                        mddev->barrier = NULL;
-                        wake_up(&mddev->sb_wait);
-                } else
-                        /* The pre-request barrier has finished */
-                        schedule_work(&mddev->barrier_work);
        }
        bio_put(bio);
 }
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
 {
        mdk_rdev_t *rdev;
@@ -333,60 +322,56 @@ static void submit_barriers(mddev_t *mddev)
                        atomic_inc(&rdev->nr_pending);
                        rcu_read_unlock();
                        bi = bio_alloc(GFP_KERNEL, 0);
-                        bi->bi_end_io = md_end_barrier;
+                        bi->bi_end_io = md_end_flush;
                        bi->bi_private = rdev;
                        bi->bi_bdev = rdev->bdev;
                        atomic_inc(&mddev->flush_pending);
-                        submit_bio(WRITE_BARRIER, bi);
+                        submit_bio(WRITE_FLUSH, bi);
                        rcu_read_lock();
                        rdev_dec_pending(rdev, mddev);
                }
        rcu_read_unlock();
 }
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-        mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
+        mddev_t *mddev = container_of(ws, mddev_t, flush_work);
-        struct bio *bio = mddev->barrier;
+        struct bio *bio = mddev->flush_bio;
        atomic_set(&mddev->flush_pending, 1);
-        if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+        if (bio->bi_size == 0)
-                bio_endio(bio, -EOPNOTSUPP);
-        else if (bio->bi_size == 0)
                /* an empty barrier - all done */
                bio_endio(bio, 0);
        else {
-                bio->bi_rw &= ~REQ_HARDBARRIER;
+                bio->bi_rw &= ~REQ_FLUSH;
                if (mddev->pers->make_request(mddev, bio))
                        generic_make_request(bio);
-                mddev->barrier = POST_REQUEST_BARRIER;
-                submit_barriers(mddev);
        }
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-                mddev->barrier = NULL;
+                mddev->flush_bio = NULL;
                wake_up(&mddev->sb_wait);
        }
 }
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
        spin_lock_irq(&mddev->write_lock);
        wait_event_lock_irq(mddev->sb_wait,
-                            !mddev->barrier,
+                            !mddev->flush_bio,
                            mddev->write_lock, /*nothing*/);
-        mddev->barrier = bio;
+        mddev->flush_bio = bio;
        spin_unlock_irq(&mddev->write_lock);
        atomic_set(&mddev->flush_pending, 1);
-        INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-        submit_barriers(mddev);
+        submit_flushes(mddev);
        if (atomic_dec_and_test(&mddev->flush_pending))
-                schedule_work(&mddev->barrier_work);
+                schedule_work(&mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 /* Support for plugging.
 * This mirrors the plugging support in request_queue, but does not
@@ -697,31 +682,6 @@ static void super_written(struct bio *bio, int error)
        bio_put(bio);
 }
-static void super_written_barrier(struct bio *bio, int error)
-{
-        struct bio *bio2 = bio->bi_private;
-        mdk_rdev_t *rdev = bio2->bi_private;
-        mddev_t *mddev = rdev->mddev;
-        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-            error == -EOPNOTSUPP) {
-                unsigned long flags;
-                /* barriers don't appear to be supported :-( */
-                set_bit(BarriersNotsupp, &rdev->flags);
-                mddev->barriers_work = 0;
-                spin_lock_irqsave(&mddev->write_lock, flags);
-                bio2->bi_next = mddev->biolist;
-                mddev->biolist = bio2;
-                spin_unlock_irqrestore(&mddev->write_lock, flags);
-                wake_up(&mddev->sb_wait);
-                bio_put(bio);
-        } else {
-                bio_put(bio2);
-                bio->bi_private = rdev;
-                super_written(bio, error);
-        }
-}
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
 {
@@ -730,51 +690,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
-         *
-         * As we might need to resubmit the request if REQ_HARDBARRIER
-         * causes ENOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
-        int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
-        bio->bi_rw = rw;
        atomic_inc(&mddev->pending_writes);
-        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
+        submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
-                struct bio *rbio;
+                   bio);
-                rw |= REQ_HARDBARRIER;
-                rbio = bio_clone(bio, GFP_NOIO);
-                rbio->bi_private = bio;
-                rbio->bi_end_io = super_written_barrier;
-                submit_bio(rw, rbio);
-        } else
-                submit_bio(rw, bio);
 }
 void md_super_wait(mddev_t *mddev)
 {
-        /* wait for all superblock writes that were scheduled to complete.
+        /* wait for all superblock writes that were scheduled to complete */
-         * if any had to be retried (due to BARRIER problems), retry them
-         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
-                while (mddev->biolist) {
-                        struct bio *bio;
-                        spin_lock_irq(&mddev->write_lock);
-                        bio = mddev->biolist;
-                        mddev->biolist = bio->bi_next ;
-                        bio->bi_next = NULL;
-                        spin_unlock_irq(&mddev->write_lock);
-                        submit_bio(bio->bi_rw, bio);
-                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
@@ -1071,7 +1008,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-        clear_bit(BarriersNotsupp, &rdev->flags);
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
@@ -1486,7 +1422,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-        clear_bit(BarriersNotsupp, &rdev->flags);
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
@@ -4505,7 +4440,6 @@ int md_run(mddev_t *mddev)
        /* may be over-ridden by personality */
        mddev->resync_max_sectors = mddev->dev_sectors;
-        mddev->barriers_work = 1;
        mddev->ok_start_degraded = start_dirty_degraded;
        if (start_readonly && mddev->ro == 0)
@@ -4684,7 +4618,6 @@ static void md_clean(mddev_t *mddev)
        mddev->recovery = 0;
        mddev->in_sync = 0;
        mddev->degraded = 0;
-        mddev->barriers_work = 0;
        mddev->safemode = 0;
        mddev->bitmap_info.offset = 0;
        mddev->bitmap_info.default_offset = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 3931299788dc..112a2c32db0c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
 #define Faulty          1               /* device is known to have a fault */
 #define In_sync         2               /* device is in_sync with rest of array */
 #define WriteMostly     4               /* Avoid reading if at all possible */
-#define BarriersNotsupp 5               /* REQ_HARDBARRIER is not supported */
 #define AllReserved     6               /* If whole device is reserved for
                                         * one array */
 #define AutoDetected    7               /* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
        int                             degraded;       /* whether md should consider
                                                         * adding a spare
                                                         */
-        int                             barriers_work;  /* initialised to true, cleared as soon
-                                                         * as a barrier request to slave
-                                                         * fails.  Only supported
-                                                         */
-        struct bio                      *biolist;       /* bios that need to be retried
-                                                         * because REQ_HARDBARRIER is not supported
-                                                         */
        atomic_t                        recovery_active; /* blocks scheduled, but not written */
        wait_queue_head_t               recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
        struct attribute_group          *to_remove;
        struct plug_handle              *plug; /* if used by personality */
-        /* Generic barrier handling.
+        /* Generic flush handling.
-         * If there is a pending barrier request, all other
+         * The last to finish preflush schedules a worker to submit
-         * writes are blocked while the devices are flushed.
+         * the rest of the request (without the REQ_FLUSH flag).
-         * The last to finish a flush schedules a worker to
-         * submit the barrier request (without the barrier flag),
-         * then submit more flush requests.
         */
-        struct bio *barrier;
+        struct bio *flush_bio;
        atomic_t flush_pending;
-        struct work_struct barrier_work;
+        struct work_struct flush_work;
        struct work_struct event_work;  /* used by dm to report failure event */
 };
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 extern int mddev_congested(mddev_t *mddev, int bits);
-extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
+extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                           sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a4..6d7ddf32ef2e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
        struct multipath_bh * mp_bh;
        struct multipath_info *multipath;
-        if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+        if (unlikely(bio->bi_rw & REQ_FLUSH)) {
-                md_barrier_request(mddev, bio);
+                md_flush_request(mddev, bio);
                return 0;
        }
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623c..a39f4c355e55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
        struct strip_zone *zone;
        mdk_rdev_t *tmp_dev;
-        if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+        if (unlikely(bio->bi_rw & REQ_FLUSH)) {
-                md_barrier_request(mddev, bio);
+                md_flush_request(mddev, bio);
                return 0;
        }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8b..378a25894c57 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
                if (r1_bio->bios[mirror] == bio)
                        break;
-        if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
+        /*
-                set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
+         * 'one mirror IO has finished' event handler:
-                set_bit(R1BIO_BarrierRetry, &r1_bio->state);
+         */
-                r1_bio->mddev->barriers_work = 0;
+        r1_bio->bios[mirror] = NULL;
-                /* Don't rdev_dec_pending in this branch - keep it for the retry */
+        to_put = bio;
-        } else {
+        if (!uptodate) {
+                md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+                /* an I/O failed, we can't clear the bitmap */
+                set_bit(R1BIO_Degraded, &r1_bio->state);
+        } else
                /*
-                 * this branch is our 'one mirror IO has finished' event handler:
+                 * Set R1BIO_Uptodate in our master bio, so that we
+                 * will return a good error code to the higher
+                 * levels even if IO on some other mirrored buffer
+                 * fails.
+                 *
+                 * The 'master' represents the composite IO operation
+                 * to user-side. So if something waits for IO, then it
+                 * will wait for the 'master' bio.
                 */
-                r1_bio->bios[mirror] = NULL;
+                set_bit(R1BIO_Uptodate, &r1_bio->state);
-                to_put = bio;
-                if (!uptodate) {
+        update_head_pos(mirror, r1_bio);
-                        md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-                        /* an I/O failed, we can't clear the bitmap */
+        if (behind) {
-                        set_bit(R1BIO_Degraded, &r1_bio->state);
+                if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-                } else
+                        atomic_dec(&r1_bio->behind_remaining);
-                        /*
-                         * Set R1BIO_Uptodate in our master bio, so that
+                /*
-                         * we will return a good error code for to the higher
+                 * In behind mode, we ACK the master bio once the I/O
-                         * levels even if IO on some other mirrored buffer fails.
+                 * has safely reached all non-writemostly
-                         *
+                 * disks. Setting the Returned bit ensures that this
-                         * The 'master' represents the composite IO operation to
+                 * gets done only once -- we don't ever want to return
-                         * user-side. So if something waits for IO, then it will
+                 * -EIO here, instead we'll wait
-                         * wait for the 'master' bio.
+                 */
-                         */
+                if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-                        set_bit(R1BIO_Uptodate, &r1_bio->state);
+                    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+                        /* Maybe we can return now */
-                update_head_pos(mirror, r1_bio);
+                        if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+                                struct bio *mbio = r1_bio->master_bio;
-                if (behind) {
+                                PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-                        if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+                                       (unsigned long long) mbio->bi_sector,
-                                atomic_dec(&r1_bio->behind_remaining);
+                                       (unsigned long long) mbio->bi_sector +
+                                       (mbio->bi_size >> 9) - 1);
-                        /* In behind mode, we ACK the master bio once the I/O has safely
+                                bio_endio(mbio, 0);
-                         * reached all non-writemostly disks. Setting the Returned bit
-                         * ensures that this gets done only once -- we don't ever want to
-                         * return -EIO here, instead we'll wait */
-                        if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-                            test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-                                /* Maybe we can return now */
-                                if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-                                        struct bio *mbio = r1_bio->master_bio;
-                                        PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-                                               (unsigned long long) mbio->bi_sector,
-                                               (unsigned long long) mbio->bi_sector +
-                                               (mbio->bi_size >> 9) - 1);
-                                        bio_endio(mbio, 0);
-                                }
                        }
                }
-                rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
        }
+        rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
        /*
-         *
         * Let's see if all mirrored write operations have finished
         * already.
         */
        if (atomic_dec_and_test(&r1_bio->remaining)) {
-                if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
+                if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-                        reschedule_retry(r1_bio);
+                        /* free extra copy of the data pages */
-                else {
+                        int i = bio->bi_vcnt;
-                        /* it really is the end of this request */
+                        while (i--)
-                        if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+                                safe_put_page(bio->bi_io_vec[i].bv_page);
-                                /* free extra copy of the data pages */
-                                int i = bio->bi_vcnt;
-                                while (i--)
-                                        safe_put_page(bio->bi_io_vec[i].bv_page);
-                        }
-                        /* clear the bitmap if all writes complete successfully */
-                        bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-                                        r1_bio->sectors,
-                                        !test_bit(R1BIO_Degraded, &r1_bio->state),
-                                        behind);
-                        md_write_end(r1_bio->mddev);
-                        raid_end_bio_io(r1_bio);
                }
+                /* clear the bitmap if all writes complete successfully */
+                bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+                                r1_bio->sectors,
+                                !test_bit(R1BIO_Degraded, &r1_bio->state),
+                                behind);
+                md_write_end(r1_bio->mddev);
+                raid_end_bio_io(r1_bio);
        }
        if (to_put)
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        struct page **behind_pages = NULL;
        const int rw = bio_data_dir(bio);
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-        unsigned long do_barriers;
+        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
        mdk_rdev_t *blocked_rdev;
        /*
         * Register the new request and wait if the reconstruction
         * thread has put up a bar for new requests.
         * Continue immediately if no resync is active currently.
-         * We test barriers_work *after* md_write_start as md_write_start
-         * may cause the first superblock write, and that will check out
-         * if barriers work.
         */
        md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                }
                finish_wait(&conf->wait_barrier, &w);
        }
-        if (unlikely(!mddev->barriers_work &&
-                     (bio->bi_rw & REQ_HARDBARRIER))) {
-                if (rw == WRITE)
-                        md_write_end(mddev);
-                bio_endio(bio, -EOPNOTSUPP);
-                return 0;
-        }
        wait_barrier(conf);
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        atomic_set(&r1_bio->remaining, 0);
        atomic_set(&r1_bio->behind_remaining, 0);
-        do_barriers = bio->bi_rw & REQ_HARDBARRIER;
-        if (do_barriers)
-                set_bit(R1BIO_Barrier, &r1_bio->state);
        bio_list_init(&bl);
        for (i = 0; i < disks; i++) {
                struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
-                mbio->bi_rw = WRITE | do_barriers | do_sync;
+                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
                mbio->bi_private = r1_bio;
                if (behind_pages) {
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
                if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
                        sync_request_write(mddev, r1_bio);
                        unplug = 1;
-                } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
-                        /* some requests in the r1bio were REQ_HARDBARRIER
-                         * requests which failed with -EOPNOTSUPP.  Hohumm..
-                         * Better resubmit without the barrier.
-                         * We know which devices to resubmit for, because
-                         * all others have had their bios[] entry cleared.
-                         * We already have a nr_pending reference on these rdevs.
-                         */
-                        int i;
-                        const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
-                        clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
-                        clear_bit(R1BIO_Barrier, &r1_bio->state);
-                        for (i=0; i < conf->raid_disks; i++)
-                                if (r1_bio->bios[i])
-                                        atomic_inc(&r1_bio->remaining);
-                        for (i=0; i < conf->raid_disks; i++)
-                                if (r1_bio->bios[i]) {
-                                        struct bio_vec *bvec;
-                                        int j;
-                                        bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
-                                        /* copy pages from the failed bio, as
-                                         * this might be a write-behind device */
-                                        __bio_for_each_segment(bvec, bio, j, 0)
-                                                bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
-                                        bio_put(r1_bio->bios[i]);
-                                        bio->bi_sector = r1_bio->sector +
-                                                conf->mirrors[i].rdev->data_offset;
-                                        bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-                                        bio->bi_end_io = raid1_end_write_request;
-                                        bio->bi_rw = WRITE | do_sync;
-                                        bio->bi_private = r1_bio;
-                                        r1_bio->bios[i] = bio;
-                                        generic_make_request(bio);
-                                }
                } else {
                        int disk;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28a..adf8cfd73313 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -117,8 +117,6 @@ struct r1bio_s {
 #define R1BIO_IsSync    1
 #define R1BIO_Degraded  2
 #define R1BIO_BehindIO  3
-#define R1BIO_Barrier   4
-#define R1BIO_BarrierRetry 5
 /* For write-behind requests, we call bi_end_io when
 * the last non-write-behind device completes, providing
 * any write was successful.  Otherwise we call when
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..f0d082f749be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        int chunk_sects = conf->chunk_mask + 1;
        const int rw = bio_data_dir(bio);
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+        const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
        struct bio_list bl;
        unsigned long flags;
        mdk_rdev_t *blocked_rdev;
-        if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+        if (unlikely(bio->bi_rw & REQ_FLUSH)) {
-                md_barrier_request(mddev, bio);
+                md_flush_request(mddev, bio);
                return 0;
        }
@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                        conf->mirrors[d].rdev->data_offset;
                mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
                mbio->bi_end_io = raid10_end_write_request;
-                mbio->bi_rw = WRITE | do_sync;
+                mbio->bi_rw = WRITE | do_sync | do_fua;
                mbio->bi_private = r10_bio;
                atomic_inc(&r10_bio->remaining);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..31140d1259dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                int rw;
                struct bio *bi;
                mdk_rdev_t *rdev;
-                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-                        rw = WRITE;
-                else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+                        if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
+                                rw = WRITE_FUA;
+                        else
+                                rw = WRITE;
+                } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                        rw = READ;
                else
                        continue;
@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                        while (wbi && wbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
+                                if (wbi->bi_rw & REQ_FUA)
+                                        set_bit(R5_WantFUA, &dev->flags);
                                tx = async_copy_data(1, wbi, dev->page,
                                        dev->sector, tx);
                                wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
        int pd_idx = sh->pd_idx;
        int qd_idx = sh->qd_idx;
        int i;
+        bool fua = false;
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
+        for (i = disks; i--; )
+                fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
        for (i = disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
-                if (dev->written || i == pd_idx || i == qd_idx)
+                if (dev->written || i == pd_idx || i == qd_idx) {
                        set_bit(R5_UPTODATE, &dev->flags);
+                        if (fua)
+                                set_bit(R5_WantFUA, &dev->flags);
+                }
        }
        if (sh->reconstruct_state == reconstruct_state_drain_run)
@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
        if (dec_preread_active) {
                /* We delay this until after ops_run_io so that if make_request
-                 * is waiting on a barrier, it won't continue until the writes
+                 * is waiting on a flush, it won't continue until the writes
                 * have actually been submitted.
                 */
                atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
        if (dec_preread_active) {
                /* We delay this until after ops_run_io so that if make_request
-                 * is waiting on a barrier, it won't continue until the writes
+                 * is waiting on a flush, it won't continue until the writes
                 * have actually been submitted.
                 */
                atomic_dec(&conf->preread_active_stripes);
@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
        const int rw = bio_data_dir(bi);
        int remaining;
-        if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {
-                /* Drain all pending writes.  We only really need
-                 * to ensure they have been submitted, but this is
-                 * easier.
-                 */
-                mddev->pers->quiesce(mddev, 1);
-                mddev->pers->quiesce(mddev, 0);
-                md_barrier_request(mddev, bi);
+        if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+                md_flush_request(mddev, bi);
                return 0;
        }
@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
                        finish_wait(&conf->wait_for_overlap, &w);
                        set_bit(STRIPE_HANDLE, &sh->state);
                        clear_bit(STRIPE_DELAYED, &sh->state);
-                        if (mddev->barrier && 
+                        if ((bi->bi_rw & REQ_SYNC) &&
                            !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                atomic_inc(&conf->preread_active_stripes);
                        release_stripe(sh);
@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
                bio_endio(bi, 0);
        }
-        if (mddev->barrier) {
-                /* We need to wait for the stripes to all be handled.
-                 * So: wait for preread_active_stripes to drop to 0.
-                 */
-                wait_event(mddev->thread->wqueue,
-                           atomic_read(&conf->preread_active_stripes) == 0);
-        }
        return 0;
 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6e..2ace0582b409 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
                                    * filling
                                    */
 #define R5_Wantdrain    13 /* dev->towrite needs to be drained */
+#define R5_WantFUA      14      /* Write should be FUA */
 /*
 * Write method
 */
author	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-22 20:07:18 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-22 20:07:18 -0400
commit	a2887097f25cd38cadfc11d10769e2b349fb5eca (patch)
tree	cd4adcb305365d6ba9acd2c02d4eb9d0125c6f8d /drivers/md
parent	8abfc6e7a45eb74e51904bbae676fae008b11366 (diff)
parent	005a1d15f5a6b2bb4ada80349513effbf22b4588 (diff)