Diffstat:
-rw-r--r--  drivers/md/linear.c    |   4
-rw-r--r--  drivers/md/md.c        | 117
-rw-r--r--  drivers/md/md.h        |  23
-rw-r--r--  drivers/md/multipath.c |   4
-rw-r--r--  drivers/md/raid0.c     |   4
-rw-r--r--  drivers/md/raid1.c     | 176
-rw-r--r--  drivers/md/raid1.h     |   2
-rw-r--r--  drivers/md/raid10.c    |   7
-rw-r--r--  drivers/md/raid5.c     |  43
-rw-r--r--  drivers/md/raid5.h     |   1
10 files changed, 122 insertions(+), 259 deletions(-)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3f..8a2f767f26d8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
294 dev_info_t *tmp_dev; 294 dev_info_t *tmp_dev;
295 sector_t start_sector; 295 sector_t start_sector;
296 296
297 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 297 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
298 md_barrier_request(mddev, bio); 298 md_flush_request(mddev, bio);
299 return 0; 299 return 0;
300 } 300 }
301 301
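The change to linear_make_request() above is the template for every personality that has no internal flush handling: an incoming REQ_FLUSH bio is no longer treated as a REQ_HARDBARRIER to be emulated, it is simply handed to the new generic helper. The identical hunk appears again in multipath.c, raid0.c and raid10.c below. Pulled out of the flattened two-column view, the new entry-point check is just:

    if (unlikely(bio->bi_rw & REQ_FLUSH)) {
        md_flush_request(mddev, bio);
        return 0;
    }
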
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c148b6302154..3640f025cb72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
226 return 0; 226 return 0;
227 } 227 }
228 rcu_read_lock(); 228 rcu_read_lock();
229 if (mddev->suspended || mddev->barrier) { 229 if (mddev->suspended) {
230 DEFINE_WAIT(__wait); 230 DEFINE_WAIT(__wait);
231 for (;;) { 231 for (;;) {
232 prepare_to_wait(&mddev->sb_wait, &__wait, 232 prepare_to_wait(&mddev->sb_wait, &__wait,
233 TASK_UNINTERRUPTIBLE); 233 TASK_UNINTERRUPTIBLE);
234 if (!mddev->suspended && !mddev->barrier) 234 if (!mddev->suspended)
235 break; 235 break;
236 rcu_read_unlock(); 236 rcu_read_unlock();
237 schedule(); 237 schedule();
@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
282 282
283int mddev_congested(mddev_t *mddev, int bits) 283int mddev_congested(mddev_t *mddev, int bits)
284{ 284{
285 if (mddev->barrier)
286 return 1;
287 return mddev->suspended; 285 return mddev->suspended;
288} 286}
289EXPORT_SYMBOL(mddev_congested); 287EXPORT_SYMBOL(mddev_congested);
290 288
291/* 289/*
292 * Generic barrier handling for md 290 * Generic flush handling for md
293 */ 291 */
294 292
295#define POST_REQUEST_BARRIER ((void*)1) 293static void md_end_flush(struct bio *bio, int err)
296
297static void md_end_barrier(struct bio *bio, int err)
298{ 294{
299 mdk_rdev_t *rdev = bio->bi_private; 295 mdk_rdev_t *rdev = bio->bi_private;
300 mddev_t *mddev = rdev->mddev; 296 mddev_t *mddev = rdev->mddev;
301 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
302 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
303 297
304 rdev_dec_pending(rdev, mddev); 298 rdev_dec_pending(rdev, mddev);
305 299
306 if (atomic_dec_and_test(&mddev->flush_pending)) { 300 if (atomic_dec_and_test(&mddev->flush_pending)) {
307 if (mddev->barrier == POST_REQUEST_BARRIER) { 301 /* The pre-request flush has finished */
308 /* This was a post-request barrier */ 302 schedule_work(&mddev->flush_work);
309 mddev->barrier = NULL;
310 wake_up(&mddev->sb_wait);
311 } else
312 /* The pre-request barrier has finished */
313 schedule_work(&mddev->barrier_work);
314 } 303 }
315 bio_put(bio); 304 bio_put(bio);
316} 305}
317 306
318static void submit_barriers(mddev_t *mddev) 307static void submit_flushes(mddev_t *mddev)
319{ 308{
320 mdk_rdev_t *rdev; 309 mdk_rdev_t *rdev;
321 310
@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev)
332 atomic_inc(&rdev->nr_pending); 321 atomic_inc(&rdev->nr_pending);
333 rcu_read_unlock(); 322 rcu_read_unlock();
334 bi = bio_alloc(GFP_KERNEL, 0); 323 bi = bio_alloc(GFP_KERNEL, 0);
335 bi->bi_end_io = md_end_barrier; 324 bi->bi_end_io = md_end_flush;
336 bi->bi_private = rdev; 325 bi->bi_private = rdev;
337 bi->bi_bdev = rdev->bdev; 326 bi->bi_bdev = rdev->bdev;
338 atomic_inc(&mddev->flush_pending); 327 atomic_inc(&mddev->flush_pending);
339 submit_bio(WRITE_BARRIER, bi); 328 submit_bio(WRITE_FLUSH, bi);
340 rcu_read_lock(); 329 rcu_read_lock();
341 rdev_dec_pending(rdev, mddev); 330 rdev_dec_pending(rdev, mddev);
342 } 331 }
343 rcu_read_unlock(); 332 rcu_read_unlock();
344} 333}
345 334
346static void md_submit_barrier(struct work_struct *ws) 335static void md_submit_flush_data(struct work_struct *ws)
347{ 336{
348 mddev_t *mddev = container_of(ws, mddev_t, barrier_work); 337 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
349 struct bio *bio = mddev->barrier; 338 struct bio *bio = mddev->flush_bio;
350 339
351 atomic_set(&mddev->flush_pending, 1); 340 atomic_set(&mddev->flush_pending, 1);
352 341
353 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) 342 if (bio->bi_size == 0)
354 bio_endio(bio, -EOPNOTSUPP);
355 else if (bio->bi_size == 0)
356 /* an empty barrier - all done */ 343 /* an empty barrier - all done */
357 bio_endio(bio, 0); 344 bio_endio(bio, 0);
358 else { 345 else {
359 bio->bi_rw &= ~REQ_HARDBARRIER; 346 bio->bi_rw &= ~REQ_FLUSH;
360 if (mddev->pers->make_request(mddev, bio)) 347 if (mddev->pers->make_request(mddev, bio))
361 generic_make_request(bio); 348 generic_make_request(bio);
362 mddev->barrier = POST_REQUEST_BARRIER;
363 submit_barriers(mddev);
364 } 349 }
365 if (atomic_dec_and_test(&mddev->flush_pending)) { 350 if (atomic_dec_and_test(&mddev->flush_pending)) {
366 mddev->barrier = NULL; 351 mddev->flush_bio = NULL;
367 wake_up(&mddev->sb_wait); 352 wake_up(&mddev->sb_wait);
368 } 353 }
369} 354}
370 355
371void md_barrier_request(mddev_t *mddev, struct bio *bio) 356void md_flush_request(mddev_t *mddev, struct bio *bio)
372{ 357{
373 spin_lock_irq(&mddev->write_lock); 358 spin_lock_irq(&mddev->write_lock);
374 wait_event_lock_irq(mddev->sb_wait, 359 wait_event_lock_irq(mddev->sb_wait,
375 !mddev->barrier, 360 !mddev->flush_bio,
376 mddev->write_lock, /*nothing*/); 361 mddev->write_lock, /*nothing*/);
377 mddev->barrier = bio; 362 mddev->flush_bio = bio;
378 spin_unlock_irq(&mddev->write_lock); 363 spin_unlock_irq(&mddev->write_lock);
379 364
380 atomic_set(&mddev->flush_pending, 1); 365 atomic_set(&mddev->flush_pending, 1);
381 INIT_WORK(&mddev->barrier_work, md_submit_barrier); 366 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
382 367
383 submit_barriers(mddev); 368 submit_flushes(mddev);
384 369
385 if (atomic_dec_and_test(&mddev->flush_pending)) 370 if (atomic_dec_and_test(&mddev->flush_pending))
386 schedule_work(&mddev->barrier_work); 371 schedule_work(&mddev->flush_work);
387} 372}
388EXPORT_SYMBOL(md_barrier_request); 373EXPORT_SYMBOL(md_flush_request);
389 374
390/* Support for plugging. 375/* Support for plugging.
391 * This mirrors the plugging support in request_queue, but does not 376 * This mirrors the plugging support in request_queue, but does not
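The hunk above replaces md's POST_REQUEST_BARRIER state machine with a much simpler flush sequence: md_flush_request() parks the incoming bio in mddev->flush_bio, submit_flushes() (middle of the hunk, only partly visible) sends an empty WRITE_FLUSH bio with md_end_flush() as its completion to every member device, and once the last of those completes, the md_submit_flush_data() worker resubmits the data portion with REQ_FLUSH cleared. For readability, here are the three fully visible new-side functions assembled from the right-hand column of the hunk:

    /* Completion handler for the per-rdev empty WRITE_FLUSH bios. */
    static void md_end_flush(struct bio *bio, int err)
    {
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;

        rdev_dec_pending(rdev, mddev);

        if (atomic_dec_and_test(&mddev->flush_pending)) {
            /* The pre-request flush has finished */
            schedule_work(&mddev->flush_work);
        }
        bio_put(bio);
    }

    /* Worker: once every pre-flush completed, pass the data part
     * (minus REQ_FLUSH) down to the personality. */
    static void md_submit_flush_data(struct work_struct *ws)
    {
        mddev_t *mddev = container_of(ws, mddev_t, flush_work);
        struct bio *bio = mddev->flush_bio;

        atomic_set(&mddev->flush_pending, 1);

        if (bio->bi_size == 0)
            /* an empty barrier - all done */
            bio_endio(bio, 0);
        else {
            bio->bi_rw &= ~REQ_FLUSH;
            if (mddev->pers->make_request(mddev, bio))
                generic_make_request(bio);
        }
        if (atomic_dec_and_test(&mddev->flush_pending)) {
            mddev->flush_bio = NULL;
            wake_up(&mddev->sb_wait);
        }
    }

    void md_flush_request(mddev_t *mddev, struct bio *bio)
    {
        spin_lock_irq(&mddev->write_lock);
        wait_event_lock_irq(mddev->sb_wait,
                            !mddev->flush_bio,
                            mddev->write_lock, /*nothing*/);
        mddev->flush_bio = bio;
        spin_unlock_irq(&mddev->write_lock);

        atomic_set(&mddev->flush_pending, 1);
        INIT_WORK(&mddev->flush_work, md_submit_flush_data);

        submit_flushes(mddev);

        if (atomic_dec_and_test(&mddev->flush_pending))
            schedule_work(&mddev->flush_work);
    }
    EXPORT_SYMBOL(md_flush_request);
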
@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error)
696 bio_put(bio); 681 bio_put(bio);
697} 682}
698 683
699static void super_written_barrier(struct bio *bio, int error)
700{
701 struct bio *bio2 = bio->bi_private;
702 mdk_rdev_t *rdev = bio2->bi_private;
703 mddev_t *mddev = rdev->mddev;
704
705 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
706 error == -EOPNOTSUPP) {
707 unsigned long flags;
708 /* barriers don't appear to be supported :-( */
709 set_bit(BarriersNotsupp, &rdev->flags);
710 mddev->barriers_work = 0;
711 spin_lock_irqsave(&mddev->write_lock, flags);
712 bio2->bi_next = mddev->biolist;
713 mddev->biolist = bio2;
714 spin_unlock_irqrestore(&mddev->write_lock, flags);
715 wake_up(&mddev->sb_wait);
716 bio_put(bio);
717 } else {
718 bio_put(bio2);
719 bio->bi_private = rdev;
720 super_written(bio, error);
721 }
722}
723
724void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 684void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
725 sector_t sector, int size, struct page *page) 685 sector_t sector, int size, struct page *page)
726{ 686{
@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
729 * and decrement it on completion, waking up sb_wait 689 * and decrement it on completion, waking up sb_wait
730 * if zero is reached. 690 * if zero is reached.
731 * If an error occurred, call md_error 691 * If an error occurred, call md_error
732 *
733 * As we might need to resubmit the request if REQ_HARDBARRIER
734 * causes ENOTSUPP, we allocate a spare bio...
735 */ 692 */
736 struct bio *bio = bio_alloc(GFP_NOIO, 1); 693 struct bio *bio = bio_alloc(GFP_NOIO, 1);
737 int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
738 694
739 bio->bi_bdev = rdev->bdev; 695 bio->bi_bdev = rdev->bdev;
740 bio->bi_sector = sector; 696 bio->bi_sector = sector;
741 bio_add_page(bio, page, size, 0); 697 bio_add_page(bio, page, size, 0);
742 bio->bi_private = rdev; 698 bio->bi_private = rdev;
743 bio->bi_end_io = super_written; 699 bio->bi_end_io = super_written;
744 bio->bi_rw = rw;
745 700
746 atomic_inc(&mddev->pending_writes); 701 atomic_inc(&mddev->pending_writes);
747 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 702 submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
748 struct bio *rbio; 703 bio);
749 rw |= REQ_HARDBARRIER;
750 rbio = bio_clone(bio, GFP_NOIO);
751 rbio->bi_private = bio;
752 rbio->bi_end_io = super_written_barrier;
753 submit_bio(rw, rbio);
754 } else
755 submit_bio(rw, bio);
756} 704}
757 705
758void md_super_wait(mddev_t *mddev) 706void md_super_wait(mddev_t *mddev)
759{ 707{
760 /* wait for all superblock writes that were scheduled to complete. 708 /* wait for all superblock writes that were scheduled to complete */
761 * if any had to be retried (due to BARRIER problems), retry them
762 */
763 DEFINE_WAIT(wq); 709 DEFINE_WAIT(wq);
764 for(;;) { 710 for(;;) {
765 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 711 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
766 if (atomic_read(&mddev->pending_writes)==0) 712 if (atomic_read(&mddev->pending_writes)==0)
767 break; 713 break;
768 while (mddev->biolist) {
769 struct bio *bio;
770 spin_lock_irq(&mddev->write_lock);
771 bio = mddev->biolist;
772 mddev->biolist = bio->bi_next ;
773 bio->bi_next = NULL;
774 spin_unlock_irq(&mddev->write_lock);
775 submit_bio(bio->bi_rw, bio);
776 }
777 schedule(); 714 schedule();
778 } 715 }
779 finish_wait(&mddev->sb_wait, &wq); 716 finish_wait(&mddev->sb_wait, &wq);
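Superblock writes lose their barrier fallback in the hunk above: no cloned bio, no BarriersNotsupp test, and no biolist retry queue. md_super_write() now issues a single write carrying REQ_FLUSH|REQ_FUA, and md_super_wait() only waits for pending_writes to drain. Assembled from the new-side column:

    void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                        sector_t sector, int size, struct page *page)
    {
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;

        atomic_inc(&mddev->pending_writes);
        submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
                   bio);
    }

    void md_super_wait(mddev_t *mddev)
    {
        /* wait for all superblock writes that were scheduled to complete */
        DEFINE_WAIT(wq);
        for (;;) {
            prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
            if (atomic_read(&mddev->pending_writes) == 0)
                break;
            schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
    }
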
@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1070 clear_bit(Faulty, &rdev->flags); 1007 clear_bit(Faulty, &rdev->flags);
1071 clear_bit(In_sync, &rdev->flags); 1008 clear_bit(In_sync, &rdev->flags);
1072 clear_bit(WriteMostly, &rdev->flags); 1009 clear_bit(WriteMostly, &rdev->flags);
1073 clear_bit(BarriersNotsupp, &rdev->flags);
1074 1010
1075 if (mddev->raid_disks == 0) { 1011 if (mddev->raid_disks == 0) {
1076 mddev->major_version = 0; 1012 mddev->major_version = 0;
@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1485 clear_bit(Faulty, &rdev->flags); 1421 clear_bit(Faulty, &rdev->flags);
1486 clear_bit(In_sync, &rdev->flags); 1422 clear_bit(In_sync, &rdev->flags);
1487 clear_bit(WriteMostly, &rdev->flags); 1423 clear_bit(WriteMostly, &rdev->flags);
1488 clear_bit(BarriersNotsupp, &rdev->flags);
1489 1424
1490 if (mddev->raid_disks == 0) { 1425 if (mddev->raid_disks == 0) {
1491 mddev->major_version = 1; 1426 mddev->major_version = 1;
@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev)
4506 /* may be over-ridden by personality */ 4441 /* may be over-ridden by personality */
4507 mddev->resync_max_sectors = mddev->dev_sectors; 4442 mddev->resync_max_sectors = mddev->dev_sectors;
4508 4443
4509 mddev->barriers_work = 1;
4510 mddev->ok_start_degraded = start_dirty_degraded; 4444 mddev->ok_start_degraded = start_dirty_degraded;
4511 4445
4512 if (start_readonly && mddev->ro == 0) 4446 if (start_readonly && mddev->ro == 0)
@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev)
4685 mddev->recovery = 0; 4619 mddev->recovery = 0;
4686 mddev->in_sync = 0; 4620 mddev->in_sync = 0;
4687 mddev->degraded = 0; 4621 mddev->degraded = 0;
4688 mddev->barriers_work = 0;
4689 mddev->safemode = 0; 4622 mddev->safemode = 0;
4690 mddev->bitmap_info.offset = 0; 4623 mddev->bitmap_info.offset = 0;
4691 mddev->bitmap_info.default_offset = 0; 4624 mddev->bitmap_info.default_offset = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a953fe2808ae..d8e2ab25103b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
87#define Faulty 1 /* device is known to have a fault */ 87#define Faulty 1 /* device is known to have a fault */
88#define In_sync 2 /* device is in_sync with rest of array */ 88#define In_sync 2 /* device is in_sync with rest of array */
89#define WriteMostly 4 /* Avoid reading if at all possible */ 89#define WriteMostly 4 /* Avoid reading if at all possible */
90#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */
91#define AllReserved 6 /* If whole device is reserved for 90#define AllReserved 6 /* If whole device is reserved for
92 * one array */ 91 * one array */
93#define AutoDetected 7 /* added by auto-detect */ 92#define AutoDetected 7 /* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
273 int degraded; /* whether md should consider 272 int degraded; /* whether md should consider
274 * adding a spare 273 * adding a spare
275 */ 274 */
276 int barriers_work; /* initialised to true, cleared as soon
277 * as a barrier request to slave
278 * fails. Only supported
279 */
280 struct bio *biolist; /* bios that need to be retried
281 * because REQ_HARDBARRIER is not supported
282 */
283 275
284 atomic_t recovery_active; /* blocks scheduled, but not written */ 276 atomic_t recovery_active; /* blocks scheduled, but not written */
285 wait_queue_head_t recovery_wait; 277 wait_queue_head_t recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
339 struct attribute_group *to_remove; 331 struct attribute_group *to_remove;
340 struct plug_handle *plug; /* if used by personality */ 332 struct plug_handle *plug; /* if used by personality */
341 333
342 /* Generic barrier handling. 334 /* Generic flush handling.
343 * If there is a pending barrier request, all other 335 * The last to finish preflush schedules a worker to submit
344 * writes are blocked while the devices are flushed. 336 * the rest of the request (without the REQ_FLUSH flag).
345 * The last to finish a flush schedules a worker to
346 * submit the barrier request (without the barrier flag),
347 * then submit more flush requests.
348 */ 337 */
349 struct bio *barrier; 338 struct bio *flush_bio;
350 atomic_t flush_pending; 339 atomic_t flush_pending;
351 struct work_struct barrier_work; 340 struct work_struct flush_work;
352 struct work_struct event_work; /* used by dm to report failure event */ 341 struct work_struct event_work; /* used by dm to report failure event */
353}; 342};
354 343
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
502extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 491extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
503 492
504extern int mddev_congested(mddev_t *mddev, int bits); 493extern int mddev_congested(mddev_t *mddev, int bits);
505extern void md_barrier_request(mddev_t *mddev, struct bio *bio); 494extern void md_flush_request(mddev_t *mddev, struct bio *bio);
506extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 495extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
507 sector_t sector, int size, struct page *page); 496 sector_t sector, int size, struct page *page);
508extern void md_super_wait(mddev_t *mddev); 497extern void md_super_wait(mddev_t *mddev);
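Net effect on md.h: the BarriersNotsupp rdev flag, the barriers_work field and the biolist retry queue disappear, md_barrier_request() becomes md_flush_request(), and the per-array barrier state is replaced by three flush fields. The new-side declaration in struct mddev_s, pulled out of the flattened hunk, reads:

    /* Generic flush handling.
     * The last to finish preflush schedules a worker to submit
     * the rest of the request (without the REQ_FLUSH flag).
     */
    struct bio *flush_bio;
    atomic_t flush_pending;
    struct work_struct flush_work;
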
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a4..6d7ddf32ef2e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
142 struct multipath_bh * mp_bh; 142 struct multipath_bh * mp_bh;
143 struct multipath_info *multipath; 143 struct multipath_info *multipath;
144 144
145 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 145 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
146 md_barrier_request(mddev, bio); 146 md_flush_request(mddev, bio);
147 return 0; 147 return 0;
148 } 148 }
149 149
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623c..a39f4c355e55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
483 struct strip_zone *zone; 483 struct strip_zone *zone;
484 mdk_rdev_t *tmp_dev; 484 mdk_rdev_t *tmp_dev;
485 485
486 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 486 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
487 md_barrier_request(mddev, bio); 487 md_flush_request(mddev, bio);
488 return 0; 488 return 0;
489 } 489 }
490 490
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad83a4dcadc3..886a9d865488 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
319 if (r1_bio->bios[mirror] == bio) 319 if (r1_bio->bios[mirror] == bio)
320 break; 320 break;
321 321
322 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 322 /*
323 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 323 * 'one mirror IO has finished' event handler:
324 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 324 */
325 r1_bio->mddev->barriers_work = 0; 325 r1_bio->bios[mirror] = NULL;
326 /* Don't rdev_dec_pending in this branch - keep it for the retry */ 326 to_put = bio;
327 } else { 327 if (!uptodate) {
328 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
329 /* an I/O failed, we can't clear the bitmap */
330 set_bit(R1BIO_Degraded, &r1_bio->state);
331 } else
328 /* 332 /*
329 * this branch is our 'one mirror IO has finished' event handler: 333 * Set R1BIO_Uptodate in our master bio, so that we
334 * will return a good error code for to the higher
335 * levels even if IO on some other mirrored buffer
336 * fails.
337 *
338 * The 'master' represents the composite IO operation
339 * to user-side. So if something waits for IO, then it
340 * will wait for the 'master' bio.
330 */ 341 */
331 r1_bio->bios[mirror] = NULL; 342 set_bit(R1BIO_Uptodate, &r1_bio->state);
332 to_put = bio; 343
333 if (!uptodate) { 344 update_head_pos(mirror, r1_bio);
334 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 345
335 /* an I/O failed, we can't clear the bitmap */ 346 if (behind) {
336 set_bit(R1BIO_Degraded, &r1_bio->state); 347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
337 } else 348 atomic_dec(&r1_bio->behind_remaining);
338 /* 349
339 * Set R1BIO_Uptodate in our master bio, so that 350 /*
340 * we will return a good error code for to the higher 351 * In behind mode, we ACK the master bio once the I/O
341 * levels even if IO on some other mirrored buffer fails. 352 * has safely reached all non-writemostly
342 * 353 * disks. Setting the Returned bit ensures that this
343 * The 'master' represents the composite IO operation to 354 * gets done only once -- we don't ever want to return
344 * user-side. So if something waits for IO, then it will 355 * -EIO here, instead we'll wait
345 * wait for the 'master' bio. 356 */
346 */ 357 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
347 set_bit(R1BIO_Uptodate, &r1_bio->state); 358 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
348 359 /* Maybe we can return now */
349 update_head_pos(mirror, r1_bio); 360 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
350 361 struct bio *mbio = r1_bio->master_bio;
351 if (behind) { 362 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
352 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 363 (unsigned long long) mbio->bi_sector,
353 atomic_dec(&r1_bio->behind_remaining); 364 (unsigned long long) mbio->bi_sector +
354 365 (mbio->bi_size >> 9) - 1);
355 /* In behind mode, we ACK the master bio once the I/O has safely 366 bio_endio(mbio, 0);
356 * reached all non-writemostly disks. Setting the Returned bit
357 * ensures that this gets done only once -- we don't ever want to
358 * return -EIO here, instead we'll wait */
359
360 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
361 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
362 /* Maybe we can return now */
363 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
364 struct bio *mbio = r1_bio->master_bio;
365 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
366 (unsigned long long) mbio->bi_sector,
367 (unsigned long long) mbio->bi_sector +
368 (mbio->bi_size >> 9) - 1);
369 bio_endio(mbio, 0);
370 }
371 } 367 }
372 } 368 }
373 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
374 } 369 }
370 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
371
375 /* 372 /*
376 *
377 * Let's see if all mirrored write operations have finished 373 * Let's see if all mirrored write operations have finished
378 * already. 374 * already.
379 */ 375 */
380 if (atomic_dec_and_test(&r1_bio->remaining)) { 376 if (atomic_dec_and_test(&r1_bio->remaining)) {
381 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) 377 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
382 reschedule_retry(r1_bio); 378 /* free extra copy of the data pages */
383 else { 379 int i = bio->bi_vcnt;
384 /* it really is the end of this request */ 380 while (i--)
385 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 381 safe_put_page(bio->bi_io_vec[i].bv_page);
386 /* free extra copy of the data pages */
387 int i = bio->bi_vcnt;
388 while (i--)
389 safe_put_page(bio->bi_io_vec[i].bv_page);
390 }
391 /* clear the bitmap if all writes complete successfully */
392 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
393 r1_bio->sectors,
394 !test_bit(R1BIO_Degraded, &r1_bio->state),
395 behind);
396 md_write_end(r1_bio->mddev);
397 raid_end_bio_io(r1_bio);
398 } 382 }
383 /* clear the bitmap if all writes complete successfully */
384 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
385 r1_bio->sectors,
386 !test_bit(R1BIO_Degraded, &r1_bio->state),
387 behind);
388 md_write_end(r1_bio->mddev);
389 raid_end_bio_io(r1_bio);
399 } 390 }
400 391
401 if (to_put) 392 if (to_put)
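With the R1BIO_Barrier/R1BIO_BarrierRetry branches gone, raid1_end_write_request() is a straight-line completion handler again; most of the apparent churn in the hunk above is re-indentation of the old else branch. A condensed view of the new-side code (the write-behind ACK block in the middle is carried over essentially unchanged and is elided here):

    /*
     * 'one mirror IO has finished' event handler:
     */
    r1_bio->bios[mirror] = NULL;
    to_put = bio;
    if (!uptodate) {
        md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
        /* an I/O failed, we can't clear the bitmap */
        set_bit(R1BIO_Degraded, &r1_bio->state);
    } else
        /* good completion for the master bio (full comment in the hunk) */
        set_bit(R1BIO_Uptodate, &r1_bio->state);

    update_head_pos(mirror, r1_bio);

    if (behind) {
        /* ... write-behind ACK of the master bio, as shown in the hunk ... */
    }
    rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);

    /*
     * Let's see if all mirrored write operations have finished
     * already.
     */
    if (atomic_dec_and_test(&r1_bio->remaining)) {
        if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
            /* free extra copy of the data pages */
            int i = bio->bi_vcnt;
            while (i--)
                safe_put_page(bio->bi_io_vec[i].bv_page);
        }
        /* clear the bitmap if all writes complete successfully */
        bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
                        r1_bio->sectors,
                        !test_bit(R1BIO_Degraded, &r1_bio->state),
                        behind);
        md_write_end(r1_bio->mddev);
        raid_end_bio_io(r1_bio);
    }
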
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
788 struct page **behind_pages = NULL; 779 struct page **behind_pages = NULL;
789 const int rw = bio_data_dir(bio); 780 const int rw = bio_data_dir(bio);
790 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 781 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
791 unsigned long do_barriers; 782 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
792 mdk_rdev_t *blocked_rdev; 783 mdk_rdev_t *blocked_rdev;
793 784
794 /* 785 /*
795 * Register the new request and wait if the reconstruction 786 * Register the new request and wait if the reconstruction
796 * thread has put up a bar for new requests. 787 * thread has put up a bar for new requests.
797 * Continue immediately if no resync is active currently. 788 * Continue immediately if no resync is active currently.
798 * We test barriers_work *after* md_write_start as md_write_start
799 * may cause the first superblock write, and that will check out
800 * if barriers work.
801 */ 789 */
802 790
803 md_write_start(mddev, bio); /* wait on superblock update early */ 791 md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 } 809 }
822 finish_wait(&conf->wait_barrier, &w); 810 finish_wait(&conf->wait_barrier, &w);
823 } 811 }
824 if (unlikely(!mddev->barriers_work &&
825 (bio->bi_rw & REQ_HARDBARRIER))) {
826 if (rw == WRITE)
827 md_write_end(mddev);
828 bio_endio(bio, -EOPNOTSUPP);
829 return 0;
830 }
831 812
832 wait_barrier(conf); 813 wait_barrier(conf);
833 814
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
959 atomic_set(&r1_bio->remaining, 0); 940 atomic_set(&r1_bio->remaining, 0);
960 atomic_set(&r1_bio->behind_remaining, 0); 941 atomic_set(&r1_bio->behind_remaining, 0);
961 942
962 do_barriers = bio->bi_rw & REQ_HARDBARRIER;
963 if (do_barriers)
964 set_bit(R1BIO_Barrier, &r1_bio->state);
965
966 bio_list_init(&bl); 943 bio_list_init(&bl);
967 for (i = 0; i < disks; i++) { 944 for (i = 0; i < disks; i++) {
968 struct bio *mbio; 945 struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
975 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 952 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
976 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 953 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
977 mbio->bi_end_io = raid1_end_write_request; 954 mbio->bi_end_io = raid1_end_write_request;
978 mbio->bi_rw = WRITE | do_barriers | do_sync; 955 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
979 mbio->bi_private = r1_bio; 956 mbio->bi_private = r1_bio;
980 957
981 if (behind_pages) { 958 if (behind_pages) {
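On the submission side, raid1's make_request() now records REQ_FLUSH and REQ_FUA from the incoming bio and ORs them into every mirror bio, while the barriers_work early-failure path and the R1BIO_Barrier flag are removed. The relevant new-side lines, condensed:

    const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
    const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));

    /* ... inside the per-disk write loop ... */
    mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
    mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
    mbio->bi_end_io = raid1_end_write_request;
    mbio->bi_rw = WRITE | do_flush_fua | do_sync;
    mbio->bi_private = r1_bio;
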
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
1634 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1611 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1635 sync_request_write(mddev, r1_bio); 1612 sync_request_write(mddev, r1_bio);
1636 unplug = 1; 1613 unplug = 1;
1637 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1638 /* some requests in the r1bio were REQ_HARDBARRIER
1639 * requests which failed with -EOPNOTSUPP. Hohumm..
1640 * Better resubmit without the barrier.
1641 * We know which devices to resubmit for, because
1642 * all others have had their bios[] entry cleared.
1643 * We already have a nr_pending reference on these rdevs.
1644 */
1645 int i;
1646 const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
1647 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1648 clear_bit(R1BIO_Barrier, &r1_bio->state);
1649 for (i=0; i < conf->raid_disks; i++)
1650 if (r1_bio->bios[i])
1651 atomic_inc(&r1_bio->remaining);
1652 for (i=0; i < conf->raid_disks; i++)
1653 if (r1_bio->bios[i]) {
1654 struct bio_vec *bvec;
1655 int j;
1656
1657 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1658 /* copy pages from the failed bio, as
1659 * this might be a write-behind device */
1660 __bio_for_each_segment(bvec, bio, j, 0)
1661 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1662 bio_put(r1_bio->bios[i]);
1663 bio->bi_sector = r1_bio->sector +
1664 conf->mirrors[i].rdev->data_offset;
1665 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1666 bio->bi_end_io = raid1_end_write_request;
1667 bio->bi_rw = WRITE | do_sync;
1668 bio->bi_private = r1_bio;
1669 r1_bio->bios[i] = bio;
1670 generic_make_request(bio);
1671 }
1672 } else { 1614 } else {
1673 int disk; 1615 int disk;
1674 1616
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28a..adf8cfd73313 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -117,8 +117,6 @@ struct r1bio_s {
117#define R1BIO_IsSync 1 117#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 118#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 119#define R1BIO_BehindIO 3
120#define R1BIO_Barrier 4
121#define R1BIO_BarrierRetry 5
122/* For write-behind requests, we call bi_end_io when 120/* For write-behind requests, we call bi_end_io when
123 * the last non-write-behind device completes, providing 121 * the last non-write-behind device completes, providing
124 * any write was successful. Otherwise we call when 122 * any write was successful. Otherwise we call when
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..f0d082f749be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
800 int chunk_sects = conf->chunk_mask + 1; 800 int chunk_sects = conf->chunk_mask + 1;
801 const int rw = bio_data_dir(bio); 801 const int rw = bio_data_dir(bio);
802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
803 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
803 struct bio_list bl; 804 struct bio_list bl;
804 unsigned long flags; 805 unsigned long flags;
805 mdk_rdev_t *blocked_rdev; 806 mdk_rdev_t *blocked_rdev;
806 807
807 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 808 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
808 md_barrier_request(mddev, bio); 809 md_flush_request(mddev, bio);
809 return 0; 810 return 0;
810 } 811 }
811 812
@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
965 conf->mirrors[d].rdev->data_offset; 966 conf->mirrors[d].rdev->data_offset;
966 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 967 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
967 mbio->bi_end_io = raid10_end_write_request; 968 mbio->bi_end_io = raid10_end_write_request;
968 mbio->bi_rw = WRITE | do_sync; 969 mbio->bi_rw = WRITE | do_sync | do_fua;
969 mbio->bi_private = r10_bio; 970 mbio->bi_private = r10_bio;
970 971
971 atomic_inc(&r10_bio->remaining); 972 atomic_inc(&r10_bio->remaining);
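raid10 gets the same treatment as the other simple personalities for REQ_FLUSH (forwarded to md_flush_request()), while REQ_FUA is passed straight through to the per-mirror writes. Condensed from the new side of the two hunks above:

    const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
    const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
    /* ... */
    if (unlikely(bio->bi_rw & REQ_FLUSH)) {
        md_flush_request(mddev, bio);
        return 0;
    }
    /* ... inside the per-mirror write loop ... */
    mbio->bi_rw = WRITE | do_sync | do_fua;
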
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..31140d1259dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
506 int rw; 506 int rw;
507 struct bio *bi; 507 struct bio *bi;
508 mdk_rdev_t *rdev; 508 mdk_rdev_t *rdev;
509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
510 rw = WRITE; 510 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
511 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 511 rw = WRITE_FUA;
512 else
513 rw = WRITE;
514 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
512 rw = READ; 515 rw = READ;
513 else 516 else
514 continue; 517 continue;
@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1031 1034
1032 while (wbi && wbi->bi_sector < 1035 while (wbi && wbi->bi_sector <
1033 dev->sector + STRIPE_SECTORS) { 1036 dev->sector + STRIPE_SECTORS) {
1037 if (wbi->bi_rw & REQ_FUA)
1038 set_bit(R5_WantFUA, &dev->flags);
1034 tx = async_copy_data(1, wbi, dev->page, 1039 tx = async_copy_data(1, wbi, dev->page,
1035 dev->sector, tx); 1040 dev->sector, tx);
1036 wbi = r5_next_bio(wbi, dev->sector); 1041 wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1048 int pd_idx = sh->pd_idx; 1053 int pd_idx = sh->pd_idx;
1049 int qd_idx = sh->qd_idx; 1054 int qd_idx = sh->qd_idx;
1050 int i; 1055 int i;
1056 bool fua = false;
1051 1057
1052 pr_debug("%s: stripe %llu\n", __func__, 1058 pr_debug("%s: stripe %llu\n", __func__,
1053 (unsigned long long)sh->sector); 1059 (unsigned long long)sh->sector);
1054 1060
1061 for (i = disks; i--; )
1062 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1063
1055 for (i = disks; i--; ) { 1064 for (i = disks; i--; ) {
1056 struct r5dev *dev = &sh->dev[i]; 1065 struct r5dev *dev = &sh->dev[i];
1057 1066
1058 if (dev->written || i == pd_idx || i == qd_idx) 1067 if (dev->written || i == pd_idx || i == qd_idx) {
1059 set_bit(R5_UPTODATE, &dev->flags); 1068 set_bit(R5_UPTODATE, &dev->flags);
1069 if (fua)
1070 set_bit(R5_WantFUA, &dev->flags);
1071 }
1060 } 1072 }
1061 1073
1062 if (sh->reconstruct_state == reconstruct_state_drain_run) 1074 if (sh->reconstruct_state == reconstruct_state_drain_run)
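For RAID5/6, FUA cannot be expressed as a barrier around the whole array, so the patch records it per stripe: ops_run_biodrain() sets the new R5_WantFUA flag on each r5dev whose queued bio carries REQ_FUA, ops_complete_reconstruct() spreads that flag to every written block plus the parity block(s), and ops_run_io() then issues those member-device writes as WRITE_FUA. A consolidated view of the three new-side fragments (surrounding code elided):

    /* ops_run_biodrain(): remember FUA while draining bios into the stripe */
    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
        if (wbi->bi_rw & REQ_FUA)
            set_bit(R5_WantFUA, &dev->flags);
        tx = async_copy_data(1, wbi, dev->page, dev->sector, tx);
        wbi = r5_next_bio(wbi, dev->sector);
    }

    /* ops_complete_reconstruct(): propagate FUA to parity as well */
    for (i = disks; i--; )
        fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);

    for (i = disks; i--; ) {
        struct r5dev *dev = &sh->dev[i];

        if (dev->written || i == pd_idx || i == qd_idx) {
            set_bit(R5_UPTODATE, &dev->flags);
            if (fua)
                set_bit(R5_WantFUA, &dev->flags);
        }
    }

    /* ops_run_io(): issue the member-device write as FUA when requested */
    if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
        if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
            rw = WRITE_FUA;
        else
            rw = WRITE;
    } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
        rw = READ;
    else
        continue;
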
@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
3281 3293
3282 if (dec_preread_active) { 3294 if (dec_preread_active) {
3283 /* We delay this until after ops_run_io so that if make_request 3295 /* We delay this until after ops_run_io so that if make_request
3284 * is waiting on a barrier, it won't continue until the writes 3296 * is waiting on a flush, it won't continue until the writes
3285 * have actually been submitted. 3297 * have actually been submitted.
3286 */ 3298 */
3287 atomic_dec(&conf->preread_active_stripes); 3299 atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
3583 3595
3584 if (dec_preread_active) { 3596 if (dec_preread_active) {
3585 /* We delay this until after ops_run_io so that if make_request 3597 /* We delay this until after ops_run_io so that if make_request
3586 * is waiting on a barrier, it won't continue until the writes 3598 * is waiting on a flush, it won't continue until the writes
3587 * have actually been submitted. 3599 * have actually been submitted.
3588 */ 3600 */
3589 atomic_dec(&conf->preread_active_stripes); 3601 atomic_dec(&conf->preread_active_stripes);
@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
3978 const int rw = bio_data_dir(bi); 3990 const int rw = bio_data_dir(bi);
3979 int remaining; 3991 int remaining;
3980 3992
3981 if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { 3993 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3982 /* Drain all pending writes. We only really need 3994 md_flush_request(mddev, bi);
3983 * to ensure they have been submitted, but this is
3984 * easier.
3985 */
3986 mddev->pers->quiesce(mddev, 1);
3987 mddev->pers->quiesce(mddev, 0);
3988 md_barrier_request(mddev, bi);
3989 return 0; 3995 return 0;
3990 } 3996 }
3991 3997
@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4103 finish_wait(&conf->wait_for_overlap, &w); 4109 finish_wait(&conf->wait_for_overlap, &w);
4104 set_bit(STRIPE_HANDLE, &sh->state); 4110 set_bit(STRIPE_HANDLE, &sh->state);
4105 clear_bit(STRIPE_DELAYED, &sh->state); 4111 clear_bit(STRIPE_DELAYED, &sh->state);
4106 if (mddev->barrier && 4112 if ((bi->bi_rw & REQ_SYNC) &&
4107 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4113 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4108 atomic_inc(&conf->preread_active_stripes); 4114 atomic_inc(&conf->preread_active_stripes);
4109 release_stripe(sh); 4115 release_stripe(sh);
@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4126 bio_endio(bi, 0); 4132 bio_endio(bi, 0);
4127 } 4133 }
4128 4134
4129 if (mddev->barrier) {
4130 /* We need to wait for the stripes to all be handled.
4131 * So: wait for preread_active_stripes to drop to 0.
4132 */
4133 wait_event(mddev->thread->wqueue,
4134 atomic_read(&conf->preread_active_stripes) == 0);
4135 }
4136 return 0; 4135 return 0;
4137} 4136}
4138 4137
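raid5's make_request() used to quiesce the whole array just to drain writes before handing the barrier off, and then waited for preread_active_stripes to drop to zero at the end. Neither is needed for a flush: the bio is forwarded like in the other personalities, and the preread bump is now keyed on REQ_SYNC rather than on a pending barrier. New-side excerpts:

    if (unlikely(bi->bi_rw & REQ_FLUSH)) {
        md_flush_request(mddev, bi);
        return 0;
    }

    /* ... per-stripe handling ... */
    set_bit(STRIPE_HANDLE, &sh->state);
    clear_bit(STRIPE_DELAYED, &sh->state);
    if ((bi->bi_rw & REQ_SYNC) &&
        !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
        atomic_inc(&conf->preread_active_stripes);
    release_stripe(sh);
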
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6e..2ace0582b409 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
275 * filling 275 * filling
276 */ 276 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */
278/* 279/*
279 * Write method 280 * Write method
280 */ 281 */