aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2010-09-03 05:56:18 -0400
committerJens Axboe <jaxboe@fusionio.com>2010-09-10 06:35:38 -0400
commite9c7469bb4f502dafc092166201bea1ad5fc0fbf (patch)
tree04202b0bb88623d3005c909eaafcb280778902da
parent7bc9fddab074d6bb630344e1969e28d20b140621 (diff)
md: implment REQ_FLUSH/FUA support
This patch converts md to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. In the core part (md.c), the following changes are notable. * Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA don't interfere with processing of other requests and thus there is no reason to mark the queue congested while FLUSH/FUA is in progress. * REQ_FLUSH/FUA failures are final and its users don't need retry logic. Retry logic is removed. * Preflush needs to be issued to all member devices but FUA writes can be handled the same way as other writes - their processing can be deferred to request_queue of member devices. md_barrier_request() is renamed to md_flush_request() and simplified accordingly. For linear, raid0 and multipath, the core changes are enough. raid1, 5 and 10 need the following conversions. * raid1: Handling of FLUSH/FUA bio's can simply be deferred to request_queues of member devices. Barrier related logic removed. * raid5: Queue draining logic dropped. FUA bit is propagated through biodrain and stripe resconstruction such that all the updated parts of the stripe are written out with FUA writes if any of the dirtying writes was FUA. preread_active_stripes handling in make_request() is updated as suggested by Neil Brown. * raid10: FUA bit needs to be propagated to write clones. linear, raid0, 1, 5 and 10 tested. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Neil Brown <neilb@suse.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
-rw-r--r--drivers/md/linear.c4
-rw-r--r--drivers/md/md.c117
-rw-r--r--drivers/md/md.h23
-rw-r--r--drivers/md/multipath.c4
-rw-r--r--drivers/md/raid0.c4
-rw-r--r--drivers/md/raid1.c176
-rw-r--r--drivers/md/raid1.h2
-rw-r--r--drivers/md/raid10.c7
-rw-r--r--drivers/md/raid5.c43
-rw-r--r--drivers/md/raid5.h1
10 files changed, 122 insertions, 259 deletions
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3f..8a2f767f26d8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
294 dev_info_t *tmp_dev; 294 dev_info_t *tmp_dev;
295 sector_t start_sector; 295 sector_t start_sector;
296 296
297 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 297 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
298 md_barrier_request(mddev, bio); 298 md_flush_request(mddev, bio);
299 return 0; 299 return 0;
300 } 300 }
301 301
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c148b6302154..3640f025cb72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
226 return 0; 226 return 0;
227 } 227 }
228 rcu_read_lock(); 228 rcu_read_lock();
229 if (mddev->suspended || mddev->barrier) { 229 if (mddev->suspended) {
230 DEFINE_WAIT(__wait); 230 DEFINE_WAIT(__wait);
231 for (;;) { 231 for (;;) {
232 prepare_to_wait(&mddev->sb_wait, &__wait, 232 prepare_to_wait(&mddev->sb_wait, &__wait,
233 TASK_UNINTERRUPTIBLE); 233 TASK_UNINTERRUPTIBLE);
234 if (!mddev->suspended && !mddev->barrier) 234 if (!mddev->suspended)
235 break; 235 break;
236 rcu_read_unlock(); 236 rcu_read_unlock();
237 schedule(); 237 schedule();
@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
282 282
283int mddev_congested(mddev_t *mddev, int bits) 283int mddev_congested(mddev_t *mddev, int bits)
284{ 284{
285 if (mddev->barrier)
286 return 1;
287 return mddev->suspended; 285 return mddev->suspended;
288} 286}
289EXPORT_SYMBOL(mddev_congested); 287EXPORT_SYMBOL(mddev_congested);
290 288
291/* 289/*
292 * Generic barrier handling for md 290 * Generic flush handling for md
293 */ 291 */
294 292
295#define POST_REQUEST_BARRIER ((void*)1) 293static void md_end_flush(struct bio *bio, int err)
296
297static void md_end_barrier(struct bio *bio, int err)
298{ 294{
299 mdk_rdev_t *rdev = bio->bi_private; 295 mdk_rdev_t *rdev = bio->bi_private;
300 mddev_t *mddev = rdev->mddev; 296 mddev_t *mddev = rdev->mddev;
301 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
302 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
303 297
304 rdev_dec_pending(rdev, mddev); 298 rdev_dec_pending(rdev, mddev);
305 299
306 if (atomic_dec_and_test(&mddev->flush_pending)) { 300 if (atomic_dec_and_test(&mddev->flush_pending)) {
307 if (mddev->barrier == POST_REQUEST_BARRIER) { 301 /* The pre-request flush has finished */
308 /* This was a post-request barrier */ 302 schedule_work(&mddev->flush_work);
309 mddev->barrier = NULL;
310 wake_up(&mddev->sb_wait);
311 } else
312 /* The pre-request barrier has finished */
313 schedule_work(&mddev->barrier_work);
314 } 303 }
315 bio_put(bio); 304 bio_put(bio);
316} 305}
317 306
318static void submit_barriers(mddev_t *mddev) 307static void submit_flushes(mddev_t *mddev)
319{ 308{
320 mdk_rdev_t *rdev; 309 mdk_rdev_t *rdev;
321 310
@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev)
332 atomic_inc(&rdev->nr_pending); 321 atomic_inc(&rdev->nr_pending);
333 rcu_read_unlock(); 322 rcu_read_unlock();
334 bi = bio_alloc(GFP_KERNEL, 0); 323 bi = bio_alloc(GFP_KERNEL, 0);
335 bi->bi_end_io = md_end_barrier; 324 bi->bi_end_io = md_end_flush;
336 bi->bi_private = rdev; 325 bi->bi_private = rdev;
337 bi->bi_bdev = rdev->bdev; 326 bi->bi_bdev = rdev->bdev;
338 atomic_inc(&mddev->flush_pending); 327 atomic_inc(&mddev->flush_pending);
339 submit_bio(WRITE_BARRIER, bi); 328 submit_bio(WRITE_FLUSH, bi);
340 rcu_read_lock(); 329 rcu_read_lock();
341 rdev_dec_pending(rdev, mddev); 330 rdev_dec_pending(rdev, mddev);
342 } 331 }
343 rcu_read_unlock(); 332 rcu_read_unlock();
344} 333}
345 334
346static void md_submit_barrier(struct work_struct *ws) 335static void md_submit_flush_data(struct work_struct *ws)
347{ 336{
348 mddev_t *mddev = container_of(ws, mddev_t, barrier_work); 337 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
349 struct bio *bio = mddev->barrier; 338 struct bio *bio = mddev->flush_bio;
350 339
351 atomic_set(&mddev->flush_pending, 1); 340 atomic_set(&mddev->flush_pending, 1);
352 341
353 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) 342 if (bio->bi_size == 0)
354 bio_endio(bio, -EOPNOTSUPP);
355 else if (bio->bi_size == 0)
356 /* an empty barrier - all done */ 343 /* an empty barrier - all done */
357 bio_endio(bio, 0); 344 bio_endio(bio, 0);
358 else { 345 else {
359 bio->bi_rw &= ~REQ_HARDBARRIER; 346 bio->bi_rw &= ~REQ_FLUSH;
360 if (mddev->pers->make_request(mddev, bio)) 347 if (mddev->pers->make_request(mddev, bio))
361 generic_make_request(bio); 348 generic_make_request(bio);
362 mddev->barrier = POST_REQUEST_BARRIER;
363 submit_barriers(mddev);
364 } 349 }
365 if (atomic_dec_and_test(&mddev->flush_pending)) { 350 if (atomic_dec_and_test(&mddev->flush_pending)) {
366 mddev->barrier = NULL; 351 mddev->flush_bio = NULL;
367 wake_up(&mddev->sb_wait); 352 wake_up(&mddev->sb_wait);
368 } 353 }
369} 354}
370 355
371void md_barrier_request(mddev_t *mddev, struct bio *bio) 356void md_flush_request(mddev_t *mddev, struct bio *bio)
372{ 357{
373 spin_lock_irq(&mddev->write_lock); 358 spin_lock_irq(&mddev->write_lock);
374 wait_event_lock_irq(mddev->sb_wait, 359 wait_event_lock_irq(mddev->sb_wait,
375 !mddev->barrier, 360 !mddev->flush_bio,
376 mddev->write_lock, /*nothing*/); 361 mddev->write_lock, /*nothing*/);
377 mddev->barrier = bio; 362 mddev->flush_bio = bio;
378 spin_unlock_irq(&mddev->write_lock); 363 spin_unlock_irq(&mddev->write_lock);
379 364
380 atomic_set(&mddev->flush_pending, 1); 365 atomic_set(&mddev->flush_pending, 1);
381 INIT_WORK(&mddev->barrier_work, md_submit_barrier); 366 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
382 367
383 submit_barriers(mddev); 368 submit_flushes(mddev);
384 369
385 if (atomic_dec_and_test(&mddev->flush_pending)) 370 if (atomic_dec_and_test(&mddev->flush_pending))
386 schedule_work(&mddev->barrier_work); 371 schedule_work(&mddev->flush_work);
387} 372}
388EXPORT_SYMBOL(md_barrier_request); 373EXPORT_SYMBOL(md_flush_request);
389 374
390/* Support for plugging. 375/* Support for plugging.
391 * This mirrors the plugging support in request_queue, but does not 376 * This mirrors the plugging support in request_queue, but does not
@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error)
696 bio_put(bio); 681 bio_put(bio);
697} 682}
698 683
699static void super_written_barrier(struct bio *bio, int error)
700{
701 struct bio *bio2 = bio->bi_private;
702 mdk_rdev_t *rdev = bio2->bi_private;
703 mddev_t *mddev = rdev->mddev;
704
705 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
706 error == -EOPNOTSUPP) {
707 unsigned long flags;
708 /* barriers don't appear to be supported :-( */
709 set_bit(BarriersNotsupp, &rdev->flags);
710 mddev->barriers_work = 0;
711 spin_lock_irqsave(&mddev->write_lock, flags);
712 bio2->bi_next = mddev->biolist;
713 mddev->biolist = bio2;
714 spin_unlock_irqrestore(&mddev->write_lock, flags);
715 wake_up(&mddev->sb_wait);
716 bio_put(bio);
717 } else {
718 bio_put(bio2);
719 bio->bi_private = rdev;
720 super_written(bio, error);
721 }
722}
723
724void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 684void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
725 sector_t sector, int size, struct page *page) 685 sector_t sector, int size, struct page *page)
726{ 686{
@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
729 * and decrement it on completion, waking up sb_wait 689 * and decrement it on completion, waking up sb_wait
730 * if zero is reached. 690 * if zero is reached.
731 * If an error occurred, call md_error 691 * If an error occurred, call md_error
732 *
733 * As we might need to resubmit the request if REQ_HARDBARRIER
734 * causes ENOTSUPP, we allocate a spare bio...
735 */ 692 */
736 struct bio *bio = bio_alloc(GFP_NOIO, 1); 693 struct bio *bio = bio_alloc(GFP_NOIO, 1);
737 int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
738 694
739 bio->bi_bdev = rdev->bdev; 695 bio->bi_bdev = rdev->bdev;
740 bio->bi_sector = sector; 696 bio->bi_sector = sector;
741 bio_add_page(bio, page, size, 0); 697 bio_add_page(bio, page, size, 0);
742 bio->bi_private = rdev; 698 bio->bi_private = rdev;
743 bio->bi_end_io = super_written; 699 bio->bi_end_io = super_written;
744 bio->bi_rw = rw;
745 700
746 atomic_inc(&mddev->pending_writes); 701 atomic_inc(&mddev->pending_writes);
747 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 702 submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
748 struct bio *rbio; 703 bio);
749 rw |= REQ_HARDBARRIER;
750 rbio = bio_clone(bio, GFP_NOIO);
751 rbio->bi_private = bio;
752 rbio->bi_end_io = super_written_barrier;
753 submit_bio(rw, rbio);
754 } else
755 submit_bio(rw, bio);
756} 704}
757 705
758void md_super_wait(mddev_t *mddev) 706void md_super_wait(mddev_t *mddev)
759{ 707{
760 /* wait for all superblock writes that were scheduled to complete. 708 /* wait for all superblock writes that were scheduled to complete */
761 * if any had to be retried (due to BARRIER problems), retry them
762 */
763 DEFINE_WAIT(wq); 709 DEFINE_WAIT(wq);
764 for(;;) { 710 for(;;) {
765 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 711 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
766 if (atomic_read(&mddev->pending_writes)==0) 712 if (atomic_read(&mddev->pending_writes)==0)
767 break; 713 break;
768 while (mddev->biolist) {
769 struct bio *bio;
770 spin_lock_irq(&mddev->write_lock);
771 bio = mddev->biolist;
772 mddev->biolist = bio->bi_next ;
773 bio->bi_next = NULL;
774 spin_unlock_irq(&mddev->write_lock);
775 submit_bio(bio->bi_rw, bio);
776 }
777 schedule(); 714 schedule();
778 } 715 }
779 finish_wait(&mddev->sb_wait, &wq); 716 finish_wait(&mddev->sb_wait, &wq);
@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1070 clear_bit(Faulty, &rdev->flags); 1007 clear_bit(Faulty, &rdev->flags);
1071 clear_bit(In_sync, &rdev->flags); 1008 clear_bit(In_sync, &rdev->flags);
1072 clear_bit(WriteMostly, &rdev->flags); 1009 clear_bit(WriteMostly, &rdev->flags);
1073 clear_bit(BarriersNotsupp, &rdev->flags);
1074 1010
1075 if (mddev->raid_disks == 0) { 1011 if (mddev->raid_disks == 0) {
1076 mddev->major_version = 0; 1012 mddev->major_version = 0;
@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1485 clear_bit(Faulty, &rdev->flags); 1421 clear_bit(Faulty, &rdev->flags);
1486 clear_bit(In_sync, &rdev->flags); 1422 clear_bit(In_sync, &rdev->flags);
1487 clear_bit(WriteMostly, &rdev->flags); 1423 clear_bit(WriteMostly, &rdev->flags);
1488 clear_bit(BarriersNotsupp, &rdev->flags);
1489 1424
1490 if (mddev->raid_disks == 0) { 1425 if (mddev->raid_disks == 0) {
1491 mddev->major_version = 1; 1426 mddev->major_version = 1;
@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev)
4506 /* may be over-ridden by personality */ 4441 /* may be over-ridden by personality */
4507 mddev->resync_max_sectors = mddev->dev_sectors; 4442 mddev->resync_max_sectors = mddev->dev_sectors;
4508 4443
4509 mddev->barriers_work = 1;
4510 mddev->ok_start_degraded = start_dirty_degraded; 4444 mddev->ok_start_degraded = start_dirty_degraded;
4511 4445
4512 if (start_readonly && mddev->ro == 0) 4446 if (start_readonly && mddev->ro == 0)
@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev)
4685 mddev->recovery = 0; 4619 mddev->recovery = 0;
4686 mddev->in_sync = 0; 4620 mddev->in_sync = 0;
4687 mddev->degraded = 0; 4621 mddev->degraded = 0;
4688 mddev->barriers_work = 0;
4689 mddev->safemode = 0; 4622 mddev->safemode = 0;
4690 mddev->bitmap_info.offset = 0; 4623 mddev->bitmap_info.offset = 0;
4691 mddev->bitmap_info.default_offset = 0; 4624 mddev->bitmap_info.default_offset = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a953fe2808ae..d8e2ab25103b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
87#define Faulty 1 /* device is known to have a fault */ 87#define Faulty 1 /* device is known to have a fault */
88#define In_sync 2 /* device is in_sync with rest of array */ 88#define In_sync 2 /* device is in_sync with rest of array */
89#define WriteMostly 4 /* Avoid reading if at all possible */ 89#define WriteMostly 4 /* Avoid reading if at all possible */
90#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */
91#define AllReserved 6 /* If whole device is reserved for 90#define AllReserved 6 /* If whole device is reserved for
92 * one array */ 91 * one array */
93#define AutoDetected 7 /* added by auto-detect */ 92#define AutoDetected 7 /* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
273 int degraded; /* whether md should consider 272 int degraded; /* whether md should consider
274 * adding a spare 273 * adding a spare
275 */ 274 */
276 int barriers_work; /* initialised to true, cleared as soon
277 * as a barrier request to slave
278 * fails. Only supported
279 */
280 struct bio *biolist; /* bios that need to be retried
281 * because REQ_HARDBARRIER is not supported
282 */
283 275
284 atomic_t recovery_active; /* blocks scheduled, but not written */ 276 atomic_t recovery_active; /* blocks scheduled, but not written */
285 wait_queue_head_t recovery_wait; 277 wait_queue_head_t recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
339 struct attribute_group *to_remove; 331 struct attribute_group *to_remove;
340 struct plug_handle *plug; /* if used by personality */ 332 struct plug_handle *plug; /* if used by personality */
341 333
342 /* Generic barrier handling. 334 /* Generic flush handling.
343 * If there is a pending barrier request, all other 335 * The last to finish preflush schedules a worker to submit
344 * writes are blocked while the devices are flushed. 336 * the rest of the request (without the REQ_FLUSH flag).
345 * The last to finish a flush schedules a worker to
346 * submit the barrier request (without the barrier flag),
347 * then submit more flush requests.
348 */ 337 */
349 struct bio *barrier; 338 struct bio *flush_bio;
350 atomic_t flush_pending; 339 atomic_t flush_pending;
351 struct work_struct barrier_work; 340 struct work_struct flush_work;
352 struct work_struct event_work; /* used by dm to report failure event */ 341 struct work_struct event_work; /* used by dm to report failure event */
353}; 342};
354 343
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
502extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 491extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
503 492
504extern int mddev_congested(mddev_t *mddev, int bits); 493extern int mddev_congested(mddev_t *mddev, int bits);
505extern void md_barrier_request(mddev_t *mddev, struct bio *bio); 494extern void md_flush_request(mddev_t *mddev, struct bio *bio);
506extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 495extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
507 sector_t sector, int size, struct page *page); 496 sector_t sector, int size, struct page *page);
508extern void md_super_wait(mddev_t *mddev); 497extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a4..6d7ddf32ef2e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
142 struct multipath_bh * mp_bh; 142 struct multipath_bh * mp_bh;
143 struct multipath_info *multipath; 143 struct multipath_info *multipath;
144 144
145 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 145 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
146 md_barrier_request(mddev, bio); 146 md_flush_request(mddev, bio);
147 return 0; 147 return 0;
148 } 148 }
149 149
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623c..a39f4c355e55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
483 struct strip_zone *zone; 483 struct strip_zone *zone;
484 mdk_rdev_t *tmp_dev; 484 mdk_rdev_t *tmp_dev;
485 485
486 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 486 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
487 md_barrier_request(mddev, bio); 487 md_flush_request(mddev, bio);
488 return 0; 488 return 0;
489 } 489 }
490 490
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad83a4dcadc3..886a9d865488 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
319 if (r1_bio->bios[mirror] == bio) 319 if (r1_bio->bios[mirror] == bio)
320 break; 320 break;
321 321
322 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 322 /*
323 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 323 * 'one mirror IO has finished' event handler:
324 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 324 */
325 r1_bio->mddev->barriers_work = 0; 325 r1_bio->bios[mirror] = NULL;
326 /* Don't rdev_dec_pending in this branch - keep it for the retry */ 326 to_put = bio;
327 } else { 327 if (!uptodate) {
328 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
329 /* an I/O failed, we can't clear the bitmap */
330 set_bit(R1BIO_Degraded, &r1_bio->state);
331 } else
328 /* 332 /*
329 * this branch is our 'one mirror IO has finished' event handler: 333 * Set R1BIO_Uptodate in our master bio, so that we
334 * will return a good error code for to the higher
335 * levels even if IO on some other mirrored buffer
336 * fails.
337 *
338 * The 'master' represents the composite IO operation
339 * to user-side. So if something waits for IO, then it
340 * will wait for the 'master' bio.
330 */ 341 */
331 r1_bio->bios[mirror] = NULL; 342 set_bit(R1BIO_Uptodate, &r1_bio->state);
332 to_put = bio; 343
333 if (!uptodate) { 344 update_head_pos(mirror, r1_bio);
334 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 345
335 /* an I/O failed, we can't clear the bitmap */ 346 if (behind) {
336 set_bit(R1BIO_Degraded, &r1_bio->state); 347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
337 } else 348 atomic_dec(&r1_bio->behind_remaining);
338 /* 349
339 * Set R1BIO_Uptodate in our master bio, so that 350 /*
340 * we will return a good error code for to the higher 351 * In behind mode, we ACK the master bio once the I/O
341 * levels even if IO on some other mirrored buffer fails. 352 * has safely reached all non-writemostly
342 * 353 * disks. Setting the Returned bit ensures that this
343 * The 'master' represents the composite IO operation to 354 * gets done only once -- we don't ever want to return
344 * user-side. So if something waits for IO, then it will 355 * -EIO here, instead we'll wait
345 * wait for the 'master' bio. 356 */
346 */ 357 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
347 set_bit(R1BIO_Uptodate, &r1_bio->state); 358 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
348 359 /* Maybe we can return now */
349 update_head_pos(mirror, r1_bio); 360 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
350 361 struct bio *mbio = r1_bio->master_bio;
351 if (behind) { 362 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
352 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 363 (unsigned long long) mbio->bi_sector,
353 atomic_dec(&r1_bio->behind_remaining); 364 (unsigned long long) mbio->bi_sector +
354 365 (mbio->bi_size >> 9) - 1);
355 /* In behind mode, we ACK the master bio once the I/O has safely 366 bio_endio(mbio, 0);
356 * reached all non-writemostly disks. Setting the Returned bit
357 * ensures that this gets done only once -- we don't ever want to
358 * return -EIO here, instead we'll wait */
359
360 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
361 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
362 /* Maybe we can return now */
363 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
364 struct bio *mbio = r1_bio->master_bio;
365 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
366 (unsigned long long) mbio->bi_sector,
367 (unsigned long long) mbio->bi_sector +
368 (mbio->bi_size >> 9) - 1);
369 bio_endio(mbio, 0);
370 }
371 } 367 }
372 } 368 }
373 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
374 } 369 }
370 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
371
375 /* 372 /*
376 *
377 * Let's see if all mirrored write operations have finished 373 * Let's see if all mirrored write operations have finished
378 * already. 374 * already.
379 */ 375 */
380 if (atomic_dec_and_test(&r1_bio->remaining)) { 376 if (atomic_dec_and_test(&r1_bio->remaining)) {
381 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) 377 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 reschedule_retry(r1_bio); 378 /* free extra copy of the data pages */
383 else { 379 int i = bio->bi_vcnt;
384 /* it really is the end of this request */ 380 while (i--)
385 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 381 safe_put_page(bio->bi_io_vec[i].bv_page);
386 /* free extra copy of the data pages */
387 int i = bio->bi_vcnt;
388 while (i--)
389 safe_put_page(bio->bi_io_vec[i].bv_page);
390 }
391 /* clear the bitmap if all writes complete successfully */
392 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
393 r1_bio->sectors,
394 !test_bit(R1BIO_Degraded, &r1_bio->state),
395 behind);
396 md_write_end(r1_bio->mddev);
397 raid_end_bio_io(r1_bio);
398 } 382 }
383 /* clear the bitmap if all writes complete successfully */
384 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
385 r1_bio->sectors,
386 !test_bit(R1BIO_Degraded, &r1_bio->state),
387 behind);
388 md_write_end(r1_bio->mddev);
389 raid_end_bio_io(r1_bio);
399 } 390 }
400 391
401 if (to_put) 392 if (to_put)
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
788 struct page **behind_pages = NULL; 779 struct page **behind_pages = NULL;
789 const int rw = bio_data_dir(bio); 780 const int rw = bio_data_dir(bio);
790 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 781 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
791 unsigned long do_barriers; 782 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
792 mdk_rdev_t *blocked_rdev; 783 mdk_rdev_t *blocked_rdev;
793 784
794 /* 785 /*
795 * Register the new request and wait if the reconstruction 786 * Register the new request and wait if the reconstruction
796 * thread has put up a bar for new requests. 787 * thread has put up a bar for new requests.
797 * Continue immediately if no resync is active currently. 788 * Continue immediately if no resync is active currently.
798 * We test barriers_work *after* md_write_start as md_write_start
799 * may cause the first superblock write, and that will check out
800 * if barriers work.
801 */ 789 */
802 790
803 md_write_start(mddev, bio); /* wait on superblock update early */ 791 md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 } 809 }
822 finish_wait(&conf->wait_barrier, &w); 810 finish_wait(&conf->wait_barrier, &w);
823 } 811 }
824 if (unlikely(!mddev->barriers_work &&
825 (bio->bi_rw & REQ_HARDBARRIER))) {
826 if (rw == WRITE)
827 md_write_end(mddev);
828 bio_endio(bio, -EOPNOTSUPP);
829 return 0;
830 }
831 812
832 wait_barrier(conf); 813 wait_barrier(conf);
833 814
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
959 atomic_set(&r1_bio->remaining, 0); 940 atomic_set(&r1_bio->remaining, 0);
960 atomic_set(&r1_bio->behind_remaining, 0); 941 atomic_set(&r1_bio->behind_remaining, 0);
961 942
962 do_barriers = bio->bi_rw & REQ_HARDBARRIER;
963 if (do_barriers)
964 set_bit(R1BIO_Barrier, &r1_bio->state);
965
966 bio_list_init(&bl); 943 bio_list_init(&bl);
967 for (i = 0; i < disks; i++) { 944 for (i = 0; i < disks; i++) {
968 struct bio *mbio; 945 struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
975 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 952 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
976 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 953 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
977 mbio->bi_end_io = raid1_end_write_request; 954 mbio->bi_end_io = raid1_end_write_request;
978 mbio->bi_rw = WRITE | do_barriers | do_sync; 955 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
979 mbio->bi_private = r1_bio; 956 mbio->bi_private = r1_bio;
980 957
981 if (behind_pages) { 958 if (behind_pages) {
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
1634 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1611 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1635 sync_request_write(mddev, r1_bio); 1612 sync_request_write(mddev, r1_bio);
1636 unplug = 1; 1613 unplug = 1;
1637 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1638 /* some requests in the r1bio were REQ_HARDBARRIER
1639 * requests which failed with -EOPNOTSUPP. Hohumm..
1640 * Better resubmit without the barrier.
1641 * We know which devices to resubmit for, because
1642 * all others have had their bios[] entry cleared.
1643 * We already have a nr_pending reference on these rdevs.
1644 */
1645 int i;
1646 const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
1647 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1648 clear_bit(R1BIO_Barrier, &r1_bio->state);
1649 for (i=0; i < conf->raid_disks; i++)
1650 if (r1_bio->bios[i])
1651 atomic_inc(&r1_bio->remaining);
1652 for (i=0; i < conf->raid_disks; i++)
1653 if (r1_bio->bios[i]) {
1654 struct bio_vec *bvec;
1655 int j;
1656
1657 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1658 /* copy pages from the failed bio, as
1659 * this might be a write-behind device */
1660 __bio_for_each_segment(bvec, bio, j, 0)
1661 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1662 bio_put(r1_bio->bios[i]);
1663 bio->bi_sector = r1_bio->sector +
1664 conf->mirrors[i].rdev->data_offset;
1665 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1666 bio->bi_end_io = raid1_end_write_request;
1667 bio->bi_rw = WRITE | do_sync;
1668 bio->bi_private = r1_bio;
1669 r1_bio->bios[i] = bio;
1670 generic_make_request(bio);
1671 }
1672 } else { 1614 } else {
1673 int disk; 1615 int disk;
1674 1616
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28a..adf8cfd73313 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -117,8 +117,6 @@ struct r1bio_s {
117#define R1BIO_IsSync 1 117#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 118#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 119#define R1BIO_BehindIO 3
120#define R1BIO_Barrier 4
121#define R1BIO_BarrierRetry 5
122/* For write-behind requests, we call bi_end_io when 120/* For write-behind requests, we call bi_end_io when
123 * the last non-write-behind device completes, providing 121 * the last non-write-behind device completes, providing
124 * any write was successful. Otherwise we call when 122 * any write was successful. Otherwise we call when
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..f0d082f749be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
800 int chunk_sects = conf->chunk_mask + 1; 800 int chunk_sects = conf->chunk_mask + 1;
801 const int rw = bio_data_dir(bio); 801 const int rw = bio_data_dir(bio);
802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
803 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
803 struct bio_list bl; 804 struct bio_list bl;
804 unsigned long flags; 805 unsigned long flags;
805 mdk_rdev_t *blocked_rdev; 806 mdk_rdev_t *blocked_rdev;
806 807
807 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 808 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
808 md_barrier_request(mddev, bio); 809 md_flush_request(mddev, bio);
809 return 0; 810 return 0;
810 } 811 }
811 812
@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
965 conf->mirrors[d].rdev->data_offset; 966 conf->mirrors[d].rdev->data_offset;
966 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 967 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
967 mbio->bi_end_io = raid10_end_write_request; 968 mbio->bi_end_io = raid10_end_write_request;
968 mbio->bi_rw = WRITE | do_sync; 969 mbio->bi_rw = WRITE | do_sync | do_fua;
969 mbio->bi_private = r10_bio; 970 mbio->bi_private = r10_bio;
970 971
971 atomic_inc(&r10_bio->remaining); 972 atomic_inc(&r10_bio->remaining);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..31140d1259dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
506 int rw; 506 int rw;
507 struct bio *bi; 507 struct bio *bi;
508 mdk_rdev_t *rdev; 508 mdk_rdev_t *rdev;
509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
510 rw = WRITE; 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
511 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 511 rw = WRITE_FUA;
512 else
513 rw = WRITE;
514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
512 rw = READ; 515 rw = READ;
513 else 516 else
514 continue; 517 continue;
@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1031 1034
1032 while (wbi && wbi->bi_sector < 1035 while (wbi && wbi->bi_sector <
1033 dev->sector + STRIPE_SECTORS) { 1036 dev->sector + STRIPE_SECTORS) {
1037 if (wbi->bi_rw & REQ_FUA)
1038 set_bit(R5_WantFUA, &dev->flags);
1034 tx = async_copy_data(1, wbi, dev->page, 1039 tx = async_copy_data(1, wbi, dev->page,
1035 dev->sector, tx); 1040 dev->sector, tx);
1036 wbi = r5_next_bio(wbi, dev->sector); 1041 wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1048 int pd_idx = sh->pd_idx; 1053 int pd_idx = sh->pd_idx;
1049 int qd_idx = sh->qd_idx; 1054 int qd_idx = sh->qd_idx;
1050 int i; 1055 int i;
1056 bool fua = false;
1051 1057
1052 pr_debug("%s: stripe %llu\n", __func__, 1058 pr_debug("%s: stripe %llu\n", __func__,
1053 (unsigned long long)sh->sector); 1059 (unsigned long long)sh->sector);
1054 1060
1061 for (i = disks; i--; )
1062 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1063
1055 for (i = disks; i--; ) { 1064 for (i = disks; i--; ) {
1056 struct r5dev *dev = &sh->dev[i]; 1065 struct r5dev *dev = &sh->dev[i];
1057 1066
1058 if (dev->written || i == pd_idx || i == qd_idx) 1067 if (dev->written || i == pd_idx || i == qd_idx) {
1059 set_bit(R5_UPTODATE, &dev->flags); 1068 set_bit(R5_UPTODATE, &dev->flags);
1069 if (fua)
1070 set_bit(R5_WantFUA, &dev->flags);
1071 }
1060 } 1072 }
1061 1073
1062 if (sh->reconstruct_state == reconstruct_state_drain_run) 1074 if (sh->reconstruct_state == reconstruct_state_drain_run)
@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
3281 3293
3282 if (dec_preread_active) { 3294 if (dec_preread_active) {
3283 /* We delay this until after ops_run_io so that if make_request 3295 /* We delay this until after ops_run_io so that if make_request
3284 * is waiting on a barrier, it won't continue until the writes 3296 * is waiting on a flush, it won't continue until the writes
3285 * have actually been submitted. 3297 * have actually been submitted.
3286 */ 3298 */
3287 atomic_dec(&conf->preread_active_stripes); 3299 atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
3583 3595
3584 if (dec_preread_active) { 3596 if (dec_preread_active) {
3585 /* We delay this until after ops_run_io so that if make_request 3597 /* We delay this until after ops_run_io so that if make_request
3586 * is waiting on a barrier, it won't continue until the writes 3598 * is waiting on a flush, it won't continue until the writes
3587 * have actually been submitted. 3599 * have actually been submitted.
3588 */ 3600 */
3589 atomic_dec(&conf->preread_active_stripes); 3601 atomic_dec(&conf->preread_active_stripes);
@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
3978 const int rw = bio_data_dir(bi); 3990 const int rw = bio_data_dir(bi);
3979 int remaining; 3991 int remaining;
3980 3992
3981 if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { 3993 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3982 /* Drain all pending writes. We only really need 3994 md_flush_request(mddev, bi);
3983 * to ensure they have been submitted, but this is
3984 * easier.
3985 */
3986 mddev->pers->quiesce(mddev, 1);
3987 mddev->pers->quiesce(mddev, 0);
3988 md_barrier_request(mddev, bi);
3989 return 0; 3995 return 0;
3990 } 3996 }
3991 3997
@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4103 finish_wait(&conf->wait_for_overlap, &w); 4109 finish_wait(&conf->wait_for_overlap, &w);
4104 set_bit(STRIPE_HANDLE, &sh->state); 4110 set_bit(STRIPE_HANDLE, &sh->state);
4105 clear_bit(STRIPE_DELAYED, &sh->state); 4111 clear_bit(STRIPE_DELAYED, &sh->state);
4106 if (mddev->barrier && 4112 if ((bi->bi_rw & REQ_SYNC) &&
4107 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4113 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4108 atomic_inc(&conf->preread_active_stripes); 4114 atomic_inc(&conf->preread_active_stripes);
4109 release_stripe(sh); 4115 release_stripe(sh);
@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4126 bio_endio(bi, 0); 4132 bio_endio(bi, 0);
4127 } 4133 }
4128 4134
4129 if (mddev->barrier) {
4130 /* We need to wait for the stripes to all be handled.
4131 * So: wait for preread_active_stripes to drop to 0.
4132 */
4133 wait_event(mddev->thread->wqueue,
4134 atomic_read(&conf->preread_active_stripes) == 0);
4135 }
4136 return 0; 4135 return 0;
4137} 4136}
4138 4137
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6e..2ace0582b409 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
275 * filling 275 * filling
276 */ 276 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */
278/* 279/*
279 * Write method 280 * Write method
280 */ 281 */