Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-crypt.c            |   2
-rw-r--r--  drivers/md/dm-io.c               |  20
-rw-r--r--  drivers/md/dm-log.c              |   2
-rw-r--r--  drivers/md/dm-raid1.c            |   8
-rw-r--r--  drivers/md/dm-region-hash.c      |  16
-rw-r--r--  drivers/md/dm-snap-persistent.c  |   2
-rw-r--r--  drivers/md/dm-snap.c             |   6
-rw-r--r--  drivers/md/dm-stripe.c           |   2
-rw-r--r--  drivers/md/dm.c                  | 398
-rw-r--r--  drivers/md/linear.c              |   4
-rw-r--r--  drivers/md/md.c                  | 117
-rw-r--r--  drivers/md/md.h                  |  23
-rw-r--r--  drivers/md/multipath.c           |   4
-rw-r--r--  drivers/md/raid0.c               |   4
-rw-r--r--  drivers/md/raid1.c               | 176
-rw-r--r--  drivers/md/raid1.h               |   2
-rw-r--r--  drivers/md/raid10.c              |   7
-rw-r--r--  drivers/md/raid5.c               |  43
-rw-r--r--  drivers/md/raid5.h               |   1
19 files changed, 226 insertions, 611 deletions
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e98f705..d5b0e4c0e702 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1278,7 +1278,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
 		return DM_MAPIO_REMAPPED;
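For reference, a sketch of the map-time pattern the converted targets now share, written against the 2.6.36-era bio API. This is illustrative only, not part of the patch; "example_map", "struct example_c" and its fields are hypothetical names. An empty REQ_FLUSH bio carries no data, so a single-device target only has to choose the destination device:

	/* Hypothetical single-device target map function (sketch). */
	static int example_map(struct dm_target *ti, struct bio *bio,
			       union map_info *map_context)
	{
		struct example_c *ec = ti->private;	/* assumed per-target context */

		if (bio->bi_rw & REQ_FLUSH) {
			/* bi_size == 0 here: dm core submits flushes as empty bios */
			bio->bi_bdev = ec->dev->bdev;
			return DM_MAPIO_REMAPPED;
		}

		/* normal I/O: remap onto the backing device */
		bio->bi_bdev = ec->dev->bdev;
		bio->bi_sector = ec->start + (bio->bi_sector - ti->begin);
		return DM_MAPIO_REMAPPED;
	}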
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0590c75b0ab6..136d4f71a116 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -31,7 +31,6 @@ struct dm_io_client {
  */
 struct io {
 	unsigned long error_bits;
-	unsigned long eopnotsupp_bits;
 	atomic_t count;
 	struct task_struct *sleeper;
 	struct dm_io_client *client;
@@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
  *---------------------------------------------------------------*/
 static void dec_count(struct io *io, unsigned int region, int error)
 {
-	if (error) {
+	if (error)
 		set_bit(region, &io->error_bits);
-		if (error == -EOPNOTSUPP)
-			set_bit(region, &io->eopnotsupp_bits);
-	}
 
 	if (atomic_dec_and_test(&io->count)) {
 		if (io->sleeper)
@@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 	sector_t remaining = where->count;
 
 	/*
-	 * where->count may be zero if rw holds a write barrier and we
-	 * need to send a zero-sized barrier.
+	 * where->count may be zero if rw holds a flush and we need to
+	 * send a zero-sized flush.
 	 */
 	do {
 		/*
@@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	 */
 	for (i = 0; i < num_regions; i++) {
 		*dp = old_pages;
-		if (where[i].count || (rw & REQ_HARDBARRIER))
+		if (where[i].count || (rw & REQ_FLUSH))
 			do_region(rw, i, where + i, dp, io);
 	}
 
@@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		return -EIO;
 	}
 
-retry:
 	io->error_bits = 0;
-	io->eopnotsupp_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = current;
 	io->client = client;
@@ -412,11 +406,6 @@ retry:
 	}
 	set_current_state(TASK_RUNNING);
 
-	if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
-		rw &= ~REQ_HARDBARRIER;
-		goto retry;
-	}
-
 	if (error_bits)
 		*error_bits = io->error_bits;
 
@@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 
 	io = mempool_alloc(client->pool, GFP_NOIO);
 	io->error_bits = 0;
-	io->eopnotsupp_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = NULL;
 	io->client = client;
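A usage sketch for the reworked dm-io path (an assumption for illustration, not code from the patch): with the eopnotsupp_bits retry loop gone, a client issues a cache flush exactly once, the way flush_header() in dm-log below does. WRITE_FLUSH replaces WRITE_BARRIER and cannot come back as -EOPNOTSUPP, since the block layer emulates flushes on devices without a write-back cache; "example_flush" is a hypothetical helper:

	/* Hypothetical helper: flush one device's write cache via dm-io. */
	static int example_flush(struct dm_io_client *client,
				 struct block_device *bdev)
	{
		struct dm_io_region region = {
			.bdev   = bdev,
			.sector = 0,
			.count  = 0,		/* zero-sized region: empty flush */
		};
		struct dm_io_request io_req = {
			.bi_rw        = WRITE_FLUSH,
			.mem.type     = DM_IO_KMEM,
			.mem.ptr.addr = NULL,
			.client       = client,
		};

		/* notify.fn is left NULL, so dm_io() completes synchronously */
		return dm_io(&io_req, 1, &region, NULL);
	}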
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5a08be0222db..33420e68d153 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc)
 		.count = 0,
 	};
 
-	lc->io_req.bi_rw = WRITE_BARRIER;
+	lc->io_req.bi_rw = WRITE_FLUSH;
 
 	return dm_io(&lc->io_req, 1, &null_location, NULL);
 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7c081bcbc3cf..19a59b041c27 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti)
 	struct dm_io_region io[ms->nr_mirrors];
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE_BARRIER,
+		.bi_rw = WRITE_FLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.bvec = NULL,
 		.client = ms->io_client,
@@ -629,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -670,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
-		if (unlikely(bio_empty_barrier(bio))) {
+		if (bio->bi_rw & REQ_FLUSH) {
 			bio_list_add(&sync, bio);
 			continue;
 		}
@@ -1203,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		if (likely(!bio_empty_barrier(bio)))
+		if (!(bio->bi_rw & REQ_FLUSH))
 			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index bd5c58b28868..dad011aed0c9 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -81,9 +81,9 @@ struct dm_region_hash {
 	struct list_head failed_recovered_regions;
 
 	/*
-	 * If there was a barrier failure no regions can be marked clean.
+	 * If there was a flush failure no regions can be marked clean.
 	 */
-	int barrier_failure;
+	int flush_failure;
 
 	void *context;
 	sector_t target_begin;
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
 	INIT_LIST_HEAD(&rh->quiesced_regions);
 	INIT_LIST_HEAD(&rh->recovered_regions);
 	INIT_LIST_HEAD(&rh->failed_recovered_regions);
-	rh->barrier_failure = 0;
+	rh->flush_failure = 0;
 
 	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
 						      sizeof(struct dm_region));
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 	region_t region = dm_rh_bio_to_region(rh, bio);
 	int recovering = 0;
 
-	if (bio_empty_barrier(bio)) {
-		rh->barrier_failure = 1;
+	if (bio->bi_rw & REQ_FLUSH) {
+		rh->flush_failure = 1;
 		return;
 	}
 
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 	struct bio *bio;
 
 	for (bio = bios->head; bio; bio = bio->bi_next) {
-		if (bio_empty_barrier(bio))
+		if (bio->bi_rw & REQ_FLUSH)
 			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
 	}
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
 	 */
 
 	/* do nothing for DM_RH_NOSYNC */
-	if (unlikely(rh->barrier_failure)) {
+	if (unlikely(rh->flush_failure)) {
 		/*
-		 * If a write barrier failed some time ago, we
+		 * If a write flush failed some time ago, we
 		 * don't know whether or not this write made it
 		 * to the disk, so we must resync the device.
 		 */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index cc2bdb83f9ad..0b61792a2780 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 	/*
 	 * Commit exceptions to disk.
 	 */
-	if (ps->valid && area_io(ps, WRITE_BARRIER))
+	if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
 		ps->valid = 0;
 
 	/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f30f6e8d594e..53cf79d8bcbc 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1585,7 +1585,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		bio->bi_bdev = s->cow->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
@@ -1689,7 +1689,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		if (!map_context->target_request_nr)
 			bio->bi_bdev = s->origin->bdev;
 		else
@@ -2133,7 +2133,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
 	struct dm_dev *dev = ti->private;
 	bio->bi_bdev = dev->bdev;
 
-	if (unlikely(bio_empty_barrier(bio)))
+	if (bio->bi_rw & REQ_FLUSH)
 		return DM_MAPIO_REMAPPED;
 
 	/* Only tell snapshots if this is a write */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c297f6da91ea..f0371b4c4fbf 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -271,7 +271,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
 	uint32_t stripe;
 	unsigned target_request_nr;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		target_request_nr = map_context->target_request_nr;
 		BUG_ON(target_request_nr >= sc->stripes);
 		bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
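The stripe hunk above depends on the flush fan-out added in dm.c below: dm core clones one empty flush per ti->num_flush_requests and tags each clone with map_context->target_request_nr. A sketch of the consuming side for a hypothetical two-disk target ("twodisk_map", "struct twodisk_c" and its fields are invented for illustration):

	/* Hypothetical target over two backing disks, num_flush_requests = 2. */
	static int twodisk_map(struct dm_target *ti, struct bio *bio,
			       union map_info *map_context)
	{
		struct twodisk_c *tc = ti->private;	/* assumed context type */

		if (bio->bi_rw & REQ_FLUSH) {
			unsigned nr = map_context->target_request_nr;

			BUG_ON(nr >= 2);
			bio->bi_bdev = tc->dev[nr]->bdev;	/* clone nr -> disk nr */
			return DM_MAPIO_REMAPPED;
		}

		/* data I/O chooses its disk from the sector as usual */
		bio->bi_bdev = tc->dev[(bio->bi_sector >> 3) & 1]->bdev;
		return DM_MAPIO_REMAPPED;
	}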
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7967eca5a2d5..7cb1352f7e7a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_QUEUE_IO_TO_THREAD 6
 
 /*
  * Work processed by per-device workqueue.
@@ -144,24 +143,9 @@ struct mapped_device {
 	spinlock_t deferred_lock;
 
 	/*
-	 * An error from the barrier request currently being processed.
-	 */
-	int barrier_error;
-
-	/*
-	 * Protect barrier_error from concurrent endio processing
-	 * in request-based dm.
-	 */
-	spinlock_t barrier_error_lock;
-
-	/*
-	 * Processing queue (flush/barriers)
+	 * Processing queue (flush)
 	 */
 	struct workqueue_struct *wq;
-	struct work_struct barrier_work;
-
-	/* A pointer to the currently processing pre/post flush request */
-	struct request *flush_request;
 
 	/*
 	 * The current mapping.
@@ -200,8 +184,8 @@ struct mapped_device {
 	/* sysfs handle */
 	struct kobject kobj;
 
-	/* zero-length barrier that will be cloned and submitted to targets */
-	struct bio barrier_bio;
+	/* zero-length flush that will be cloned and submitted to targets */
+	struct bio flush_bio;
 };
 
 /*
@@ -512,7 +496,7 @@ static void end_io_acct(struct dm_io *io)
 
 	/*
 	 * After this is decremented the bio must not be touched if it is
-	 * a barrier.
+	 * a flush.
 	 */
 	dm_disk(md)->part0.in_flight[rw] = pending =
 		atomic_dec_return(&md->pending[rw]);
@@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io)
  */
 static void queue_io(struct mapped_device *md, struct bio *bio)
 {
-	down_write(&md->io_lock);
+	unsigned long flags;
 
-	spin_lock_irq(&md->deferred_lock);
+	spin_lock_irqsave(&md->deferred_lock, flags);
 	bio_list_add(&md->deferred, bio);
-	spin_unlock_irq(&md->deferred_lock);
-
-	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
-		queue_work(md->wq, &md->work);
-
-	up_write(&md->io_lock);
+	spin_unlock_irqrestore(&md->deferred_lock, flags);
+	queue_work(md->wq, &md->work);
 }
 
 /*
@@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error)
 	 * Target requested pushing back the I/O.
 	 */
 	spin_lock_irqsave(&md->deferred_lock, flags);
-	if (__noflush_suspending(md)) {
-		if (!(io->bio->bi_rw & REQ_HARDBARRIER))
-			bio_list_add_head(&md->deferred,
-					  io->bio);
-	} else
+	if (__noflush_suspending(md))
+		bio_list_add_head(&md->deferred, io->bio);
+	else
 		/* noflush suspend was interrupted. */
 		io->error = -EIO;
 	spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error)
 
 	io_error = io->error;
 	bio = io->bio;
+	end_io_acct(io);
+	free_io(md, io);
+
+	if (io_error == DM_ENDIO_REQUEUE)
+		return;
 
-	if (bio->bi_rw & REQ_HARDBARRIER) {
+	if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
 		/*
-		 * There can be just one barrier request so we use
-		 * a per-device variable for error reporting.
-		 * Note that you can't touch the bio after end_io_acct
-		 *
-		 * We ignore -EOPNOTSUPP for empty flush reported by
-		 * underlying devices. We assume that if the device
-		 * doesn't support empty barriers, it doesn't need
-		 * cache flushing commands.
+		 * Preflush done for flush with data, reissue
+		 * without REQ_FLUSH.
 		 */
-		if (!md->barrier_error &&
-		    !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
-			md->barrier_error = io_error;
-		end_io_acct(io);
-		free_io(md, io);
+		bio->bi_rw &= ~REQ_FLUSH;
+		queue_io(md, bio);
 	} else {
-		end_io_acct(io);
-		free_io(md, io);
-
-		if (io_error != DM_ENDIO_REQUEUE) {
-			trace_block_bio_complete(md->queue, bio);
-
-			bio_endio(bio, io_error);
-		}
+		/* done with normal IO or empty flush */
+		trace_block_bio_complete(md->queue, bio);
+		bio_endio(bio, io_error);
 	}
 }
@@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
-static void store_barrier_error(struct mapped_device *md, int error)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&md->barrier_error_lock, flags);
-	/*
-	 * Basically, the first error is taken, but:
-	 *   -EOPNOTSUPP supersedes any I/O error.
-	 *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
-	 */
-	if (!md->barrier_error || error == -EOPNOTSUPP ||
-	    (md->barrier_error != -EOPNOTSUPP &&
-	     error == DM_ENDIO_REQUEUE))
-		md->barrier_error = error;
-	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
-}
-
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone)
 static void dm_end_request(struct request *clone, int error)
 {
 	int rw = rq_data_dir(clone);
-	int run_queue = 1;
-	bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		rq->errors = clone->errors;
 		rq->resid_len = clone->resid_len;
 
@@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
-
-	if (unlikely(is_barrier)) {
-		if (unlikely(error))
-			store_barrier_error(md, error);
-		run_queue = 0;
-	} else
-		blk_end_request_all(rq, error);
-
-	rq_completed(md, rw, run_queue);
+	blk_end_request_all(rq, error);
+	rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
@@ -862,16 +805,6 @@ void dm_requeue_unmapped_request(struct request *clone)
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.
-		 * Leave it to dm_end_request(), which handles this special
-		 * case.
-		 */
-		dm_end_request(clone, DM_ENDIO_REQUEUE);
-		return;
-	}
-
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -961,19 +894,6 @@ static void dm_complete_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.  So can't use
-		 * softirq_done with the original.
-		 * Pass the clone to dm_done() directly in this special case.
-		 * It is safe (even if clone->q->queue_lock is held here)
-		 * because there is no I/O dispatching during the completion
-		 * of barrier clone.
-		 */
-		dm_done(clone, error, true);
-		return;
-	}
-
 	tio->error = error;
 	rq->completion_data = clone;
 	blk_complete_request(rq);
@@ -990,17 +910,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.
-		 * Leave it to dm_end_request(), which handles this special
-		 * case.
-		 */
-		BUG_ON(error > 0);
-		dm_end_request(clone, error);
-		return;
-	}
-
 	rq->cmd_flags |= REQ_FAILED;
 	dm_complete_request(clone, error);
 }
@@ -1119,7 +1028,7 @@ static void dm_bio_destructor(struct bio *bio)
 }
 
 /*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
  */
 static struct bio *split_bvec(struct bio *bio, sector_t sector,
 			      unsigned short idx, unsigned int offset,
@@ -1134,7 +1043,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 
 	clone->bi_sector = sector;
 	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+	clone->bi_rw = bio->bi_rw;
 	clone->bi_vcnt = 1;
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1070,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
-	clone->bi_rw &= ~REQ_HARDBARRIER;
 	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -1225,16 +1133,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
 		__issue_target_request(ci, ti, request_nr, len);
 }
 
-static int __clone_and_map_empty_barrier(struct clone_info *ci)
+static int __clone_and_map_empty_flush(struct clone_info *ci)
 {
 	unsigned target_nr = 0;
 	struct dm_target *ti;
 
+	BUG_ON(bio_has_data(ci->bio));
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
 		__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
 
-	ci->sector_count = 0;
-
 	return 0;
 }
 
@@ -1289,9 +1196,6 @@ static int __clone_and_map(struct clone_info *ci)
 	sector_t len = 0, max;
 	struct dm_target_io *tio;
 
-	if (unlikely(bio_empty_barrier(bio)))
-		return __clone_and_map_empty_barrier(ci);
-
 	if (unlikely(bio->bi_rw & REQ_DISCARD))
 		return __clone_and_map_discard(ci);
 
@@ -1383,16 +1287,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_live_table(md);
 	if (unlikely(!ci.map)) {
-		if (!(bio->bi_rw & REQ_HARDBARRIER))
-			bio_io_error(bio);
-		else
-			if (!md->barrier_error)
-				md->barrier_error = -EIO;
+		bio_io_error(bio);
 		return;
 	}
 
 	ci.md = md;
-	ci.bio = bio;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
 	atomic_set(&ci.io->io_count, 1);
@@ -1400,14 +1299,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->md = md;
 	spin_lock_init(&ci.io->endio_lock);
 	ci.sector = bio->bi_sector;
-	ci.sector_count = bio_sectors(bio);
-	if (unlikely(bio_empty_barrier(bio)))
-		ci.sector_count = 1;
 	ci.idx = bio->bi_idx;
 
 	start_io_acct(ci.io);
-	while (ci.sector_count && !error)
-		error = __clone_and_map(&ci);
+	if (bio->bi_rw & REQ_FLUSH) {
+		ci.bio = &ci.md->flush_bio;
+		ci.sector_count = 0;
+		error = __clone_and_map_empty_flush(&ci);
+		/* dec_pending submits any data associated with flush */
+	} else {
+		ci.bio = bio;
+		ci.sector_count = bio_sectors(bio);
+		while (ci.sector_count && !error)
+			error = __clone_and_map(&ci);
+	}
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
@@ -1491,22 +1396,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
 	part_stat_unlock();
 
-	/*
-	 * If we're suspended or the thread is processing barriers
-	 * we have to queue this io for later.
-	 */
-	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+	/* if we're suspended, we have to queue this io for later */
+	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
 		up_read(&md->io_lock);
 
-		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
-		    bio_rw(bio) == READA) {
+		if (bio_rw(bio) != READA)
+			queue_io(md, bio);
+		else
 			bio_io_error(bio);
-			return 0;
-		}
-
-		queue_io(md, bio);
-
 		return 0;
 	}
 
@@ -1537,14 +1434,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return _dm_request(q, bio);
 }
 
-static bool dm_rq_is_flush_request(struct request *rq)
-{
-	if (rq->cmd_flags & REQ_FLUSH)
-		return true;
-	else
-		return false;
-}
-
 void dm_dispatch_request(struct request *rq)
 {
 	int r;
@@ -1592,22 +1481,15 @@ static int setup_clone(struct request *clone, struct request *rq,
 {
 	int r;
 
-	if (dm_rq_is_flush_request(rq)) {
-		blk_rq_init(NULL, clone);
-		clone->cmd_type = REQ_TYPE_FS;
-		clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
-	} else {
-		r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-				      dm_rq_bio_constructor, tio);
-		if (r)
-			return r;
-
-		clone->cmd = rq->cmd;
-		clone->cmd_len = rq->cmd_len;
-		clone->sense = rq->sense;
-		clone->buffer = rq->buffer;
-	}
+	r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+			      dm_rq_bio_constructor, tio);
+	if (r)
+		return r;
 
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
+	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
@@ -1648,9 +1530,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 	struct mapped_device *md = q->queuedata;
 	struct request *clone;
 
-	if (unlikely(dm_rq_is_flush_request(rq)))
-		return BLKPREP_OK;
-
 	if (unlikely(rq->special)) {
 		DMWARN("Already has something in rq->special.");
 		return BLKPREP_KILL;
@@ -1727,6 +1606,7 @@ static void dm_request_fn(struct request_queue *q)
 	struct dm_table *map = dm_get_live_table(md);
 	struct dm_target *ti;
 	struct request *rq, *clone;
+	sector_t pos;
 
 	/*
 	 * For suspend, check blk_queue_stopped() and increment
@@ -1739,15 +1619,14 @@ static void dm_request_fn(struct request_queue *q)
 		if (!rq)
 			goto plug_and_out;
 
-		if (unlikely(dm_rq_is_flush_request(rq))) {
-			BUG_ON(md->flush_request);
-			md->flush_request = rq;
-			blk_start_request(rq);
-			queue_work(md->wq, &md->barrier_work);
-			goto out;
-		}
+		/* always use block 0 to find the target for flushes for now */
+		pos = 0;
+		if (!(rq->cmd_flags & REQ_FLUSH))
+			pos = blk_rq_pos(rq);
+
+		ti = dm_table_find_target(map, pos);
+		BUG_ON(!dm_target_is_valid(ti));
 
-		ti = dm_table_find_target(map, blk_rq_pos(rq));
 		if (ti->type->busy && ti->type->busy(ti))
 			goto plug_and_out;
 
@@ -1918,7 +1797,6 @@ out:
 static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
-static void dm_rq_barrier_work(struct work_struct *work);
 
 static void dm_init_md_queue(struct mapped_device *md)
 {
@@ -1940,6 +1818,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
@@ -1972,7 +1851,6 @@ static struct mapped_device *alloc_dev(int minor)
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
-	spin_lock_init(&md->barrier_error_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
@@ -1995,7 +1873,6 @@ static struct mapped_device *alloc_dev(int minor)
 	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
-	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
 	init_waitqueue_head(&md->eventq);
 
 	md->disk->major = _major;
@@ -2015,6 +1892,10 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->bdev)
 		goto bad_bdev;
 
+	bio_init(&md->flush_bio);
+	md->flush_bio.bi_bdev = md->bdev;
+	md->flush_bio.bi_rw = WRITE_FLUSH;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -2245,7 +2126,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
 
 	elv_register_queue(md->queue);
 
@@ -2406,43 +2286,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	return r;
 }
 
-static void dm_flush(struct mapped_device *md)
-{
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
-	bio_init(&md->barrier_bio);
-	md->barrier_bio.bi_bdev = md->bdev;
-	md->barrier_bio.bi_rw = WRITE_BARRIER;
-	__split_and_process_bio(md, &md->barrier_bio);
-
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-}
-
-static void process_barrier(struct mapped_device *md, struct bio *bio)
-{
-	md->barrier_error = 0;
-
-	dm_flush(md);
-
-	if (!bio_empty_barrier(bio)) {
-		__split_and_process_bio(md, bio);
-		/*
-		 * If the request isn't supported, don't waste time with
-		 * the second flush.
-		 */
-		if (md->barrier_error != -EOPNOTSUPP)
-			dm_flush(md);
-	}
-
-	if (md->barrier_error != DM_ENDIO_REQUEUE)
-		bio_endio(bio, md->barrier_error);
-	else {
-		spin_lock_irq(&md->deferred_lock);
-		bio_list_add_head(&md->deferred, bio);
-		spin_unlock_irq(&md->deferred_lock);
-	}
-}
-
 /*
  * Process the deferred bios
  */
@@ -2452,33 +2295,27 @@ static void dm_wq_work(struct work_struct *work)
 						work);
 	struct bio *c;
 
-	down_write(&md->io_lock);
+	down_read(&md->io_lock);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
 		c = bio_list_pop(&md->deferred);
 		spin_unlock_irq(&md->deferred_lock);
 
-		if (!c) {
-			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+		if (!c)
 			break;
-		}
 
-		up_write(&md->io_lock);
+		up_read(&md->io_lock);
 
 		if (dm_request_based(md))
 			generic_make_request(c);
-		else {
-			if (c->bi_rw & REQ_HARDBARRIER)
-				process_barrier(md, c);
-			else
-				__split_and_process_bio(md, c);
-		}
+		else
+			__split_and_process_bio(md, c);
 
-		down_write(&md->io_lock);
+		down_read(&md->io_lock);
 	}
 
-	up_write(&md->io_lock);
+	up_read(&md->io_lock);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
@@ -2488,73 +2325,6 @@ static void dm_queue_flush(struct mapped_device *md)
 	queue_work(md->wq, &md->work);
 }
 
-static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	tio->info.target_request_nr = request_nr;
-}
-
-/* Issue barrier requests to targets and wait for their completion. */
-static int dm_rq_barrier(struct mapped_device *md)
-{
-	int i, j;
-	struct dm_table *map = dm_get_live_table(md);
-	unsigned num_targets = dm_table_get_num_targets(map);
-	struct dm_target *ti;
-	struct request *clone;
-
-	md->barrier_error = 0;
-
-	for (i = 0; i < num_targets; i++) {
-		ti = dm_table_get_target(map, i);
-		for (j = 0; j < ti->num_flush_requests; j++) {
-			clone = clone_rq(md->flush_request, md, GFP_NOIO);
-			dm_rq_set_target_request_nr(clone, j);
-			atomic_inc(&md->pending[rq_data_dir(clone)]);
-			map_request(ti, clone, md);
-		}
-	}
-
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-	dm_table_put(map);
-
-	return md->barrier_error;
-}
-
-static void dm_rq_barrier_work(struct work_struct *work)
-{
-	int error;
-	struct mapped_device *md = container_of(work, struct mapped_device,
-						barrier_work);
-	struct request_queue *q = md->queue;
-	struct request *rq;
-	unsigned long flags;
-
-	/*
-	 * Hold the md reference here and leave it at the last part so that
-	 * the md can't be deleted by device opener when the barrier request
-	 * completes.
-	 */
-	dm_get(md);
-
-	error = dm_rq_barrier(md);
-
-	rq = md->flush_request;
-	md->flush_request = NULL;
-
-	if (error == DM_ENDIO_REQUEUE) {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_requeue_request(q, rq);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	} else
-		blk_end_request_all(rq, error);
-
-	blk_run_queue(q);
-
-	dm_put(md);
-}
-
 /*
  * Swap in a new table, returning the old one for the caller to destroy.
  */
@@ -2677,23 +2447,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 *
 	 * To get all processes out of __split_and_process_bio in dm_request,
 	 * we take the write lock. To prevent any process from reentering
-	 * __split_and_process_bio from dm_request, we set
-	 * DMF_QUEUE_IO_TO_THREAD.
-	 *
-	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
-	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
-	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
-	 * further calls to __split_and_process_bio from dm_wq_work.
+	 * __split_and_process_bio from dm_request and quiesce the thread
+	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
+	 * flush_workqueue(md->wq).
 	 */
 	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
 	up_write(&md->io_lock);
 
 	/*
-	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
-	 * can be kicked until md->queue is stopped. So stop md->queue before
-	 * flushing md->wq.
+	 * Stop md->queue before flushing md->wq in case request-based
+	 * dm defers requests to md->wq from md->queue.
 	 */
 	if (dm_request_based(md))
 		stop_queue(md->queue);
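Read together, the dm.c hunks replace the barrier state machine with a two-phase flush sequence. The outline below paraphrases the code above; it is a summary, not new code:

	/*
	 * Bio-based flush flow after this patch:
	 *
	 *   __split_and_process_bio(md, bio)
	 *     if (bio->bi_rw & REQ_FLUSH)
	 *       clone md->flush_bio once per ti->num_flush_requests for
	 *       every target (empty preflush, ci.sector_count = 0);
	 *     else
	 *       clone and map the data as before;
	 *
	 *   dec_pending(io, error)               // last clone completed
	 *     if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size)
	 *       bio->bi_rw &= ~REQ_FLUSH;        // preflush done,
	 *       queue_io(md, bio);               // now submit the payload
	 *     else
	 *       bio_endio(bio, io_error);        // plain I/O or empty flush
	 *
	 * blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA) tells the block
	 * layer that dm decomposes flushes itself, so the -EOPNOTSUPP
	 * retries and per-device barrier_error bookkeeping can go.
	 */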
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3f..8a2f767f26d8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dbf822df942a..225815197a3d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -227,12 +227,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended || mddev->barrier) {
+	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended && !mddev->barrier)
+			if (!mddev->suspended)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -283,40 +283,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-	if (mddev->barrier)
-		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
 	mddev_t *mddev = rdev->mddev;
-	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		if (mddev->barrier == POST_REQUEST_BARRIER) {
-			/* This was a post-request barrier */
-			mddev->barrier = NULL;
-			wake_up(&mddev->sb_wait);
-		} else
-			/* The pre-request barrier has finished */
-			schedule_work(&mddev->barrier_work);
+		/* The pre-request flush has finished */
+		schedule_work(&mddev->flush_work);
 	}
 	bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 
@@ -333,60 +322,56 @@ static void submit_barriers(mddev_t *mddev)
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc(GFP_KERNEL, 0);
-			bi->bi_end_io = md_end_barrier;
+			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_BARRIER, bi);
+			submit_bio(WRITE_FLUSH, bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-	struct bio *bio = mddev->barrier;
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct bio *bio = mddev->flush_bio;
 
 	atomic_set(&mddev->flush_pending, 1);
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		bio_endio(bio, -EOPNOTSUPP);
-	else if (bio->bi_size == 0)
+	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~REQ_HARDBARRIER;
+		bio->bi_rw &= ~REQ_FLUSH;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
-		mddev->barrier = POST_REQUEST_BARRIER;
-		submit_barriers(mddev);
 	}
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->barrier = NULL;
+		mddev->flush_bio = NULL;
 		wake_up(&mddev->sb_wait);
 	}
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->barrier,
+			    !mddev->flush_bio,
 			    mddev->write_lock, /*nothing*/);
-	mddev->barrier = bio;
+	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
 	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 
-	submit_barriers(mddev);
+	submit_flushes(mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending))
-		schedule_work(&mddev->barrier_work);
+		schedule_work(&mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
@@ -697,31 +682,6 @@ static void super_written(struct bio *bio, int error)
 	bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-	struct bio *bio2 = bio->bi_private;
-	mdk_rdev_t *rdev = bio2->bi_private;
-	mddev_t *mddev = rdev->mddev;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-	    error == -EOPNOTSUPP) {
-		unsigned long flags;
-		/* barriers don't appear to be supported :-( */
-		set_bit(BarriersNotsupp, &rdev->flags);
-		mddev->barriers_work = 0;
-		spin_lock_irqsave(&mddev->write_lock, flags);
-		bio2->bi_next = mddev->biolist;
-		mddev->biolist = bio2;
-		spin_unlock_irqrestore(&mddev->write_lock, flags);
-		wake_up(&mddev->sb_wait);
-		bio_put(bio);
-	} else {
-		bio_put(bio2);
-		bio->bi_private = rdev;
-		super_written(bio, error);
-	}
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		    sector_t sector, int size, struct page *page)
 {
@@ -730,51 +690,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
-	 *
-	 * As we might need to resubmit the request if REQ_HARDBARRIER
-	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_rw = rw;
 
 	atomic_inc(&mddev->pending_writes);
-	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-		struct bio *rbio;
-		rw |= REQ_HARDBARRIER;
-		rbio = bio_clone(bio, GFP_NOIO);
-		rbio->bi_private = bio;
-		rbio->bi_end_io = super_written_barrier;
-		submit_bio(rw, rbio);
-	} else
-		submit_bio(rw, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+		   bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-	/* wait for all superblock writes that were scheduled to complete.
-	 * if any had to be retried (due to BARRIER problems), retry them
-	 */
+	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
 	for(;;) {
 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&mddev->pending_writes)==0)
 			break;
-		while (mddev->biolist) {
-			struct bio *bio;
-			spin_lock_irq(&mddev->write_lock);
-			bio = mddev->biolist;
-			mddev->biolist = bio->bi_next ;
-			bio->bi_next = NULL;
-			spin_unlock_irq(&mddev->write_lock);
-			submit_bio(bio->bi_rw, bio);
-		}
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
@@ -1071,7 +1008,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
@@ -1486,7 +1422,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
@@ -4505,7 +4440,6 @@ int md_run(mddev_t *mddev)
 	/* may be over-ridden by personality */
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly && mddev->ro == 0)
@@ -4684,7 +4618,6 @@ static void md_clean(mddev_t *mddev)
 	mddev->recovery = 0;
 	mddev->in_sync = 0;
 	mddev->degraded = 0;
-	mddev->barriers_work = 0;
 	mddev->safemode = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
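The md side serializes one flush per array and reuses the preflush/payload split. The outline below paraphrases the new md.c code above; it is a summary, not new code:

	/*
	 * md flush flow after this patch:
	 *
	 *   md_flush_request(mddev, bio)
	 *     wait until mddev->flush_bio is NULL;  // one flush at a time
	 *     mddev->flush_bio = bio;
	 *     submit_flushes(mddev);                // empty WRITE_FLUSH bio
	 *                                           // to every active rdev
	 *     last md_end_flush() schedules mddev->flush_work;
	 *
	 *   md_submit_flush_data(ws)
	 *     if (bio->bi_size == 0)
	 *       bio_endio(bio, 0);                  // empty flush: done
	 *     else {
	 *       bio->bi_rw &= ~REQ_FLUSH;           // preflush complete,
	 *       pers->make_request(mddev, bio);     // hand off the payload
	 *     }
	 *
	 * Any REQ_FUA on the payload is passed down to the member devices,
	 * which is why the old POST_REQUEST_BARRIER second round of flushes
	 * and the BIO_EOPNOTSUPP retry paths could be deleted.
	 */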
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 3931299788dc..112a2c32db0c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
 #define	Faulty		1	/* device is known to have a fault */
 #define	In_sync		2	/* device is in_sync with rest of array */
 #define	WriteMostly	4	/* Avoid reading if at all possible */
-#define	BarriersNotsupp	5	/* REQ_HARDBARRIER is not supported */
 #define	AllReserved	6	/* If whole device is reserved for
 				 * one array */
 #define	AutoDetected	7	/* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
-	int				barriers_work;	/* initialised to true, cleared as soon
-							 * as a barrier request to slave
-							 * fails.  Only supported
-							 */
-	struct bio			*biolist;	/* bios that need to be retried
-							 * because REQ_HARDBARRIER is not supported
-							 */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
 	struct attribute_group		*to_remove;
 	struct plug_handle		*plug;	/* if used by personality */
 
-	/* Generic barrier handling.
-	 * If there is a pending barrier request, all other
-	 * writes are blocked while the devices are flushed.
-	 * The last to finish a flush schedules a worker to
-	 * submit the barrier request (without the barrier flag),
-	 * then submit more flush requests.
+	/* Generic flush handling.
+	 * The last to finish preflush schedules a worker to submit
+	 * the rest of the request (without the REQ_FLUSH flag).
 	 */
-	struct bio *barrier;
+	struct bio *flush_bio;
 	atomic_t flush_pending;
-	struct work_struct barrier_work;
+	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 };
 
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
 extern int mddev_congested(mddev_t *mddev, int bits);
-extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
+extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a4..6d7ddf32ef2e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623c..a39f4c355e55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
483 struct strip_zone *zone; 483 struct strip_zone *zone;
484 mdk_rdev_t *tmp_dev; 484 mdk_rdev_t *tmp_dev;
485 485
486 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 486 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
487 md_barrier_request(mddev, bio); 487 md_flush_request(mddev, bio);
488 return 0; 488 return 0;
489 } 489 }
490 490
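Both hunks above (and the raid10 one below) divert pure-flush bios to the new md_flush_request() helper. For orientation, a hypothetical caller-side sketch of what such a bio looks like when it arrives: a zero-length payload with REQ_FLUSH set, the way blkdev_issue_flush-style callers request a cache flush. The function name here is illustrative, not part of this patch:

	#include <linux/bio.h>
	#include <linux/fs.h>

	static void issue_cache_flush(struct block_device *bdev,
				      bio_end_io_t *done, void *priv)
	{
		struct bio *bio = bio_alloc(GFP_NOIO, 0);	/* no data pages */

		bio->bi_bdev = bdev;
		bio->bi_end_io = done;
		bio->bi_private = priv;
		submit_bio(WRITE_FLUSH, bio);	/* WRITE with REQ_FLUSH set */
	}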
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8b..378a25894c57 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
319 if (r1_bio->bios[mirror] == bio) 319 if (r1_bio->bios[mirror] == bio)
320 break; 320 break;
321 321
322 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 322 /*
323 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 323 * 'one mirror IO has finished' event handler:
324 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 324 */
325 r1_bio->mddev->barriers_work = 0; 325 r1_bio->bios[mirror] = NULL;
326 /* Don't rdev_dec_pending in this branch - keep it for the retry */ 326 to_put = bio;
327 } else { 327 if (!uptodate) {
328 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
329 /* an I/O failed, we can't clear the bitmap */
330 set_bit(R1BIO_Degraded, &r1_bio->state);
331 } else
328 /* 332 /*
329 * this branch is our 'one mirror IO has finished' event handler: 333 * Set R1BIO_Uptodate in our master bio, so that we
 334 * will return a good error code to the higher
335 * levels even if IO on some other mirrored buffer
336 * fails.
337 *
338 * The 'master' represents the composite IO operation
339 * to user-side. So if something waits for IO, then it
340 * will wait for the 'master' bio.
330 */ 341 */
331 r1_bio->bios[mirror] = NULL; 342 set_bit(R1BIO_Uptodate, &r1_bio->state);
332 to_put = bio; 343
333 if (!uptodate) { 344 update_head_pos(mirror, r1_bio);
334 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 345
335 /* an I/O failed, we can't clear the bitmap */ 346 if (behind) {
336 set_bit(R1BIO_Degraded, &r1_bio->state); 347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
337 } else 348 atomic_dec(&r1_bio->behind_remaining);
338 /* 349
339 * Set R1BIO_Uptodate in our master bio, so that 350 /*
 340 * we will return a good error code to the higher 351 * In behind mode, we ACK the master bio once the I/O
341 * levels even if IO on some other mirrored buffer fails. 352 * has safely reached all non-writemostly
342 * 353 * disks. Setting the Returned bit ensures that this
343 * The 'master' represents the composite IO operation to 354 * gets done only once -- we don't ever want to return
344 * user-side. So if something waits for IO, then it will 355 * -EIO here, instead we'll wait
345 * wait for the 'master' bio. 356 */
346 */ 357 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
347 set_bit(R1BIO_Uptodate, &r1_bio->state); 358 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
348 359 /* Maybe we can return now */
349 update_head_pos(mirror, r1_bio); 360 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
350 361 struct bio *mbio = r1_bio->master_bio;
351 if (behind) { 362 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
352 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 363 (unsigned long long) mbio->bi_sector,
353 atomic_dec(&r1_bio->behind_remaining); 364 (unsigned long long) mbio->bi_sector +
354 365 (mbio->bi_size >> 9) - 1);
355 /* In behind mode, we ACK the master bio once the I/O has safely 366 bio_endio(mbio, 0);
356 * reached all non-writemostly disks. Setting the Returned bit
357 * ensures that this gets done only once -- we don't ever want to
358 * return -EIO here, instead we'll wait */
359
360 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
361 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
362 /* Maybe we can return now */
363 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
364 struct bio *mbio = r1_bio->master_bio;
365 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
366 (unsigned long long) mbio->bi_sector,
367 (unsigned long long) mbio->bi_sector +
368 (mbio->bi_size >> 9) - 1);
369 bio_endio(mbio, 0);
370 }
371 } 367 }
372 } 368 }
373 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
374 } 369 }
370 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
371
375 /* 372 /*
376 *
377 * Let's see if all mirrored write operations have finished 373 * Let's see if all mirrored write operations have finished
378 * already. 374 * already.
379 */ 375 */
380 if (atomic_dec_and_test(&r1_bio->remaining)) { 376 if (atomic_dec_and_test(&r1_bio->remaining)) {
381 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) 377 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 reschedule_retry(r1_bio); 378 /* free extra copy of the data pages */
383 else { 379 int i = bio->bi_vcnt;
384 /* it really is the end of this request */ 380 while (i--)
385 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 381 safe_put_page(bio->bi_io_vec[i].bv_page);
386 /* free extra copy of the data pages */
387 int i = bio->bi_vcnt;
388 while (i--)
389 safe_put_page(bio->bi_io_vec[i].bv_page);
390 }
391 /* clear the bitmap if all writes complete successfully */
392 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
393 r1_bio->sectors,
394 !test_bit(R1BIO_Degraded, &r1_bio->state),
395 behind);
396 md_write_end(r1_bio->mddev);
397 raid_end_bio_io(r1_bio);
398 } 382 }
383 /* clear the bitmap if all writes complete successfully */
384 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
385 r1_bio->sectors,
386 !test_bit(R1BIO_Degraded, &r1_bio->state),
387 behind);
388 md_write_end(r1_bio->mddev);
389 raid_end_bio_io(r1_bio);
399 } 390 }
400 391
401 if (to_put) 392 if (to_put)
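The early-ACK test in the write-behind branch above is terse, so a worked example may help: remaining counts all writes still in flight (including the one whose completion is being handled), while behind_remaining counts the write-mostly ones among them, so behind_remaining >= remaining - 1 means every write still pending besides this one targets a write-mostly disk. A standalone model (illustrative userspace C, not kernel code) with three mirrors, one of them write-mostly:

	#include <stdio.h>

	int main(void)
	{
		int remaining = 3;        /* writes in flight, incl. the completing one */
		int behind_remaining = 1; /* ...of which this many are write-mostly */

		/* First fast mirror completes: 1 >= 3 - 1 is false, no ACK yet. */
		printf("ack after 1st fast disk: %d\n",
		       behind_remaining >= remaining - 1);
		remaining--;

		/* Second fast mirror completes: 1 >= 2 - 1 holds, so the master
		 * bio can be acknowledged while the write-mostly disk catches
		 * up (provided R1BIO_Uptodate is set, as in the code above). */
		printf("ack after 2nd fast disk: %d\n",
		       behind_remaining >= remaining - 1);
		return 0;
	}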
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
788 struct page **behind_pages = NULL; 779 struct page **behind_pages = NULL;
789 const int rw = bio_data_dir(bio); 780 const int rw = bio_data_dir(bio);
790 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 781 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
791 unsigned long do_barriers; 782 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
792 mdk_rdev_t *blocked_rdev; 783 mdk_rdev_t *blocked_rdev;
793 784
794 /* 785 /*
795 * Register the new request and wait if the reconstruction 786 * Register the new request and wait if the reconstruction
796 * thread has put up a bar for new requests. 787 * thread has put up a bar for new requests.
797 * Continue immediately if no resync is active currently. 788 * Continue immediately if no resync is active currently.
798 * We test barriers_work *after* md_write_start as md_write_start
799 * may cause the first superblock write, and that will check out
800 * if barriers work.
801 */ 789 */
802 790
803 md_write_start(mddev, bio); /* wait on superblock update early */ 791 md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 } 809 }
822 finish_wait(&conf->wait_barrier, &w); 810 finish_wait(&conf->wait_barrier, &w);
823 } 811 }
824 if (unlikely(!mddev->barriers_work &&
825 (bio->bi_rw & REQ_HARDBARRIER))) {
826 if (rw == WRITE)
827 md_write_end(mddev);
828 bio_endio(bio, -EOPNOTSUPP);
829 return 0;
830 }
831 812
832 wait_barrier(conf); 813 wait_barrier(conf);
833 814
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
959 atomic_set(&r1_bio->remaining, 0); 940 atomic_set(&r1_bio->remaining, 0);
960 atomic_set(&r1_bio->behind_remaining, 0); 941 atomic_set(&r1_bio->behind_remaining, 0);
961 942
962 do_barriers = bio->bi_rw & REQ_HARDBARRIER;
963 if (do_barriers)
964 set_bit(R1BIO_Barrier, &r1_bio->state);
965
966 bio_list_init(&bl); 943 bio_list_init(&bl);
967 for (i = 0; i < disks; i++) { 944 for (i = 0; i < disks; i++) {
968 struct bio *mbio; 945 struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
975 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 952 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
976 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 953 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
977 mbio->bi_end_io = raid1_end_write_request; 954 mbio->bi_end_io = raid1_end_write_request;
978 mbio->bi_rw = WRITE | do_barriers | do_sync; 955 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
979 mbio->bi_private = r1_bio; 956 mbio->bi_private = r1_bio;
980 957
981 if (behind_pages) { 958 if (behind_pages) {
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
1634 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1611 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1635 sync_request_write(mddev, r1_bio); 1612 sync_request_write(mddev, r1_bio);
1636 unplug = 1; 1613 unplug = 1;
1637 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1638 /* some requests in the r1bio were REQ_HARDBARRIER
1639 * requests which failed with -EOPNOTSUPP. Hohumm..
1640 * Better resubmit without the barrier.
1641 * We know which devices to resubmit for, because
1642 * all others have had their bios[] entry cleared.
1643 * We already have a nr_pending reference on these rdevs.
1644 */
1645 int i;
1646 const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
1647 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1648 clear_bit(R1BIO_Barrier, &r1_bio->state);
1649 for (i=0; i < conf->raid_disks; i++)
1650 if (r1_bio->bios[i])
1651 atomic_inc(&r1_bio->remaining);
1652 for (i=0; i < conf->raid_disks; i++)
1653 if (r1_bio->bios[i]) {
1654 struct bio_vec *bvec;
1655 int j;
1656
1657 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1658 /* copy pages from the failed bio, as
1659 * this might be a write-behind device */
1660 __bio_for_each_segment(bvec, bio, j, 0)
1661 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1662 bio_put(r1_bio->bios[i]);
1663 bio->bi_sector = r1_bio->sector +
1664 conf->mirrors[i].rdev->data_offset;
1665 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1666 bio->bi_end_io = raid1_end_write_request;
1667 bio->bi_rw = WRITE | do_sync;
1668 bio->bi_private = r1_bio;
1669 r1_bio->bios[i] = bio;
1670 generic_make_request(bio);
1671 }
1672 } else { 1614 } else {
1673 int disk; 1615 int disk;
1674 1616
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28a..adf8cfd73313 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -117,8 +117,6 @@ struct r1bio_s {
117#define R1BIO_IsSync 1 117#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 118#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 119#define R1BIO_BehindIO 3
120#define R1BIO_Barrier 4
121#define R1BIO_BarrierRetry 5
122/* For write-behind requests, we call bi_end_io when 120/* For write-behind requests, we call bi_end_io when
123 * the last non-write-behind device completes, providing 121 * the last non-write-behind device completes, providing
124 * any write was successful. Otherwise we call when 122 * any write was successful. Otherwise we call when
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..f0d082f749be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
800 int chunk_sects = conf->chunk_mask + 1; 800 int chunk_sects = conf->chunk_mask + 1;
801 const int rw = bio_data_dir(bio); 801 const int rw = bio_data_dir(bio);
802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
803 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
803 struct bio_list bl; 804 struct bio_list bl;
804 unsigned long flags; 805 unsigned long flags;
805 mdk_rdev_t *blocked_rdev; 806 mdk_rdev_t *blocked_rdev;
806 807
807 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 808 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
808 md_barrier_request(mddev, bio); 809 md_flush_request(mddev, bio);
809 return 0; 810 return 0;
810 } 811 }
811 812
@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
965 conf->mirrors[d].rdev->data_offset; 966 conf->mirrors[d].rdev->data_offset;
966 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 967 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
967 mbio->bi_end_io = raid10_end_write_request; 968 mbio->bi_end_io = raid10_end_write_request;
968 mbio->bi_rw = WRITE | do_sync; 969 mbio->bi_rw = WRITE | do_sync | do_fua;
969 mbio->bi_private = r10_bio; 970 mbio->bi_private = r10_bio;
970 971
971 atomic_inc(&r10_bio->remaining); 972 atomic_inc(&r10_bio->remaining);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..31140d1259dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
506 int rw; 506 int rw;
507 struct bio *bi; 507 struct bio *bi;
508 mdk_rdev_t *rdev; 508 mdk_rdev_t *rdev;
509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
510 rw = WRITE; 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
511 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 511 rw = WRITE_FUA;
512 else
513 rw = WRITE;
514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
512 rw = READ; 515 rw = READ;
513 else 516 else
514 continue; 517 continue;
@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1031 1034
1032 while (wbi && wbi->bi_sector < 1035 while (wbi && wbi->bi_sector <
1033 dev->sector + STRIPE_SECTORS) { 1036 dev->sector + STRIPE_SECTORS) {
1037 if (wbi->bi_rw & REQ_FUA)
1038 set_bit(R5_WantFUA, &dev->flags);
1034 tx = async_copy_data(1, wbi, dev->page, 1039 tx = async_copy_data(1, wbi, dev->page,
1035 dev->sector, tx); 1040 dev->sector, tx);
1036 wbi = r5_next_bio(wbi, dev->sector); 1041 wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1048 int pd_idx = sh->pd_idx; 1053 int pd_idx = sh->pd_idx;
1049 int qd_idx = sh->qd_idx; 1054 int qd_idx = sh->qd_idx;
1050 int i; 1055 int i;
1056 bool fua = false;
1051 1057
1052 pr_debug("%s: stripe %llu\n", __func__, 1058 pr_debug("%s: stripe %llu\n", __func__,
1053 (unsigned long long)sh->sector); 1059 (unsigned long long)sh->sector);
1054 1060
1061 for (i = disks; i--; )
1062 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1063
1055 for (i = disks; i--; ) { 1064 for (i = disks; i--; ) {
1056 struct r5dev *dev = &sh->dev[i]; 1065 struct r5dev *dev = &sh->dev[i];
1057 1066
1058 if (dev->written || i == pd_idx || i == qd_idx) 1067 if (dev->written || i == pd_idx || i == qd_idx) {
1059 set_bit(R5_UPTODATE, &dev->flags); 1068 set_bit(R5_UPTODATE, &dev->flags);
1069 if (fua)
1070 set_bit(R5_WantFUA, &dev->flags);
1071 }
1060 } 1072 }
1061 1073
1062 if (sh->reconstruct_state == reconstruct_state_drain_run) 1074 if (sh->reconstruct_state == reconstruct_state_drain_run)
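The three raid5 hunks above thread FUA through the stripe cache: ops_run_biodrain() marks a dev R5_WantFUA when the incoming bio carried REQ_FUA, ops_complete_reconstruct() spreads that mark over every written block and the parity block(s), and ops_run_io() then issues those writes as WRITE_FUA. The spreading step matters because data must not become durable while the parity protecting it still sits in a volatile cache. A small userspace model (illustrative only) of the propagation:

	#include <stdbool.h>
	#include <stdio.h>

	#define NDISKS 5
	#define PD_IDX 4			/* parity device of this stripe */

	int main(void)
	{
		bool want_fua[NDISKS] = { false, true, false, false, false };
		bool written[NDISKS]  = { false, true, false, false, false };
		bool fua = false;
		int i;

		for (i = NDISKS; i--; )		/* mirrors ops_complete_reconstruct() */
			fua |= want_fua[i];

		for (i = NDISKS; i--; )
			if (written[i] || i == PD_IDX)
				want_fua[i] = fua;	/* parity inherits FUA too */

		for (i = 0; i < NDISKS; i++)
			printf("dev %d: %s\n", i,
			       want_fua[i] ? "WRITE_FUA" : "plain/none");
		return 0;
	}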
@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
3281 3293
3282 if (dec_preread_active) { 3294 if (dec_preread_active) {
3283 /* We delay this until after ops_run_io so that if make_request 3295 /* We delay this until after ops_run_io so that if make_request
3284 * is waiting on a barrier, it won't continue until the writes 3296 * is waiting on a flush, it won't continue until the writes
3285 * have actually been submitted. 3297 * have actually been submitted.
3286 */ 3298 */
3287 atomic_dec(&conf->preread_active_stripes); 3299 atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
3583 3595
3584 if (dec_preread_active) { 3596 if (dec_preread_active) {
3585 /* We delay this until after ops_run_io so that if make_request 3597 /* We delay this until after ops_run_io so that if make_request
3586 * is waiting on a barrier, it won't continue until the writes 3598 * is waiting on a flush, it won't continue until the writes
3587 * have actually been submitted. 3599 * have actually been submitted.
3588 */ 3600 */
3589 atomic_dec(&conf->preread_active_stripes); 3601 atomic_dec(&conf->preread_active_stripes);
@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
3978 const int rw = bio_data_dir(bi); 3990 const int rw = bio_data_dir(bi);
3979 int remaining; 3991 int remaining;
3980 3992
3981 if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { 3993 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3982 /* Drain all pending writes. We only really need 3994 md_flush_request(mddev, bi);
3983 * to ensure they have been submitted, but this is
3984 * easier.
3985 */
3986 mddev->pers->quiesce(mddev, 1);
3987 mddev->pers->quiesce(mddev, 0);
3988 md_barrier_request(mddev, bi);
3989 return 0; 3995 return 0;
3990 } 3996 }
3991 3997
@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4103 finish_wait(&conf->wait_for_overlap, &w); 4109 finish_wait(&conf->wait_for_overlap, &w);
4104 set_bit(STRIPE_HANDLE, &sh->state); 4110 set_bit(STRIPE_HANDLE, &sh->state);
4105 clear_bit(STRIPE_DELAYED, &sh->state); 4111 clear_bit(STRIPE_DELAYED, &sh->state);
4106 if (mddev->barrier && 4112 if ((bi->bi_rw & REQ_SYNC) &&
4107 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4113 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4108 atomic_inc(&conf->preread_active_stripes); 4114 atomic_inc(&conf->preread_active_stripes);
4109 release_stripe(sh); 4115 release_stripe(sh);
@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4126 bio_endio(bi, 0); 4132 bio_endio(bi, 0);
4127 } 4133 }
4128 4134
4129 if (mddev->barrier) {
4130 /* We need to wait for the stripes to all be handled.
4131 * So: wait for preread_active_stripes to drop to 0.
4132 */
4133 wait_event(mddev->thread->wqueue,
4134 atomic_read(&conf->preread_active_stripes) == 0);
4135 }
4136 return 0; 4135 return 0;
4137} 4136}
4138 4137
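The quiesce and preread-drain code deleted above existed because REQ_HARDBARRIER imposed ordering inside md itself. REQ_FLUSH makes no ordering promise: it only guarantees that previously completed writes reach stable media, so a caller that needs ordering waits for the flush to finish before issuing dependent writes, and raid5 can simply forward the flush via md_flush_request(). A hypothetical caller-side sketch of that contract (illustrative names, era-appropriate bio API, not code from this patch):

	#include <linux/bio.h>
	#include <linux/completion.h>
	#include <linux/fs.h>

	struct flush_ctx {
		struct completion done;
	};

	static void flush_done(struct bio *bio, int err)
	{
		struct flush_ctx *ctx = bio->bi_private;

		complete(&ctx->done);
		bio_put(bio);
	}

	static void write_after_flush(struct block_device *bdev,
				      struct bio *data_bio)
	{
		struct flush_ctx ctx;
		struct bio *fb = bio_alloc(GFP_NOIO, 0);

		init_completion(&ctx.done);
		fb->bi_bdev = bdev;
		fb->bi_end_io = flush_done;
		fb->bi_private = &ctx;
		submit_bio(WRITE_FLUSH, fb);

		/* REQ_FLUSH does not order requests by itself: wait, then write. */
		wait_for_completion(&ctx.done);
		submit_bio(WRITE, data_bio);
	}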
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6e..2ace0582b409 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
275 * filling 275 * filling
276 */ 276 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */
278/* 279/*
279 * Write method 280 * Write method
280 */ 281 */