author		NeilBrown <neilb@suse.de>	2005-11-09 00:39:34 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-09 10:56:38 -0500
commit		a9701a30470856408d08657eb1bd7ae29a146190 (patch)
tree		eb6ea8c82fdc1b50bf56abadeee63a935034cf27
parent		bd926c63b7a6843d3ce2728396c0891e54fce5c4 (diff)
[PATCH] md: support BIO_RW_BARRIER for md/raid1
We can only accept BARRIER requests if all slaves handle barriers, and that can, of course, change with time.

So we keep track of whether the whole array seems safe for barriers, and also whether each individual rdev handles barriers. We initially assume barriers are OK.

When writing the superblock we try a barrier, and if that fails, we flag things for no-barriers. This will usually clear the flags fairly quickly.

If writing the superblock finds that BIO_RW_BARRIER is -ENOTSUPP, we need to resubmit, so introduce the function "md_super_wait" which waits for requests to finish, and retries ENOTSUPP requests without the barrier flag.

When writing the real raid1, write requests which were BIO_RW_BARRIER but which aren't supported need to be retried. So raid1d is enhanced to do this, and when any bio write completes (i.e. no retry is needed) we remove it from the r1bio, so that devices needing retry are easy to find.

We should hardly ever get -ENOTSUPP errors when writing data to the raid. It should only happen if:

1/ the device used to support BARRIER, but now doesn't. Few devices change like this, though raid1 can!

or

2/ the array has no persistent superblock, so there was no opportunity to pre-test for barriers when writing the superblock.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
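In outline, the mechanism is: try each superblock write with a barrier; if the device returns -EOPNOTSUPP, mark the device and the array as barrier-incapable, park the request, and have the waiter resubmit it without the barrier flag. The following is a minimal userspace sketch of that retry scheme, assuming a simulated device that rejects barrier writes; every name in it (RW_BARRIER, struct req, write_with_fallback, wait_and_retry) is invented for illustration, and none of this is the patch's code.

#include <errno.h>
#include <stdio.h>
#include <stdbool.h>

#define RW_BARRIER 0x1			/* stand-in for (1<<BIO_RW_BARRIER) */

struct req {
	int flags;
	const char *data;
	struct req *next;
};

static bool barriers_work = true;	/* analogue of mddev->barriers_work */
static struct req *retry_list;		/* analogue of mddev->biolist */

/* Simulated device that, like an rdev without barrier support,
 * rejects any write carrying the barrier flag. */
static int submit(struct req *rq)
{
	if (rq->flags & RW_BARRIER)
		return -EOPNOTSUPP;
	printf("wrote '%s' (flags=%#x)\n", rq->data, rq->flags);
	return 0;
}

/* First attempt: add the barrier flag while barriers still look safe;
 * on -EOPNOTSUPP, remember the failure and park the request for retry. */
static void write_with_fallback(struct req *rq)
{
	if (barriers_work)
		rq->flags |= RW_BARRIER;
	if (submit(rq) == -EOPNOTSUPP) {
		barriers_work = false;
		rq->next = retry_list;
		retry_list = rq;
	}
}

/* Analogue of md_super_wait(): drain the retry list, stripping the
 * barrier flag before resubmitting each parked request. */
static void wait_and_retry(void)
{
	while (retry_list) {
		struct req *rq = retry_list;
		retry_list = rq->next;
		rq->flags &= ~RW_BARRIER;
		submit(rq);
	}
}

int main(void)
{
	struct req rq = { .flags = 0, .data = "superblock", .next = NULL };
	write_with_fallback(&rq);	/* fails with -EOPNOTSUPP, gets parked */
	wait_and_retry();		/* resubmitted without the barrier */
	write_with_fallback(&rq);	/* later writes skip the barrier */
	return 0;
}

The key property, as in md_super_wait() below, is that a failed barrier write is never surfaced as an error: it is queued, the offending flag is stripped, and the write is retried plain.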
-rw-r--r--	drivers/md/bitmap.c		  5
-rw-r--r--	drivers/md/md.c			 99
-rw-r--r--	drivers/md/raid1.c		134
-rw-r--r--	include/linux/raid/md.h		  1
-rw-r--r--	include/linux/raid/md_k.h	  8
-rw-r--r--	include/linux/raid/raid1.h	  4
6 files changed, 189 insertions, 62 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 220273e81ed6..51315302a85e 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -301,7 +301,7 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
 			       page);
 
 	if (wait)
-		wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+		md_super_wait(mddev);
 	return 0;
 }
 
@@ -828,8 +828,7 @@ int bitmap_unplug(struct bitmap *bitmap)
 				 wake_up_process(bitmap->writeback_daemon->tsk));
 		spin_unlock_irq(&bitmap->write_lock);
 	} else
-		wait_event(bitmap->mddev->sb_wait,
-			   atomic_read(&bitmap->mddev->pending_writes)==0);
+		md_super_wait(bitmap->mddev);
 	}
 	return 0;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index caa4add00c1b..199016932de5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -330,18 +330,46 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 static int super_written(struct bio *bio, unsigned int bytes_done, int error)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
+	mddev_t *mddev = rdev->mddev;
 	if (bio->bi_size)
 		return 1;
 
 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
-		md_error(rdev->mddev, rdev);
+		md_error(mddev, rdev);
 
-	if (atomic_dec_and_test(&rdev->mddev->pending_writes))
-		wake_up(&rdev->mddev->sb_wait);
+	if (atomic_dec_and_test(&mddev->pending_writes))
+		wake_up(&mddev->sb_wait);
 	bio_put(bio);
 	return 0;
 }
 
+static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
+{
+	struct bio *bio2 = bio->bi_private;
+	mdk_rdev_t *rdev = bio2->bi_private;
+	mddev_t *mddev = rdev->mddev;
+	if (bio->bi_size)
+		return 1;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+	    error == -EOPNOTSUPP) {
+		unsigned long flags;
+		/* barriers don't appear to be supported :-( */
+		set_bit(BarriersNotsupp, &rdev->flags);
+		mddev->barriers_work = 0;
+		spin_lock_irqsave(&mddev->write_lock, flags);
+		bio2->bi_next = mddev->biolist;
+		mddev->biolist = bio2;
+		spin_unlock_irqrestore(&mddev->write_lock, flags);
+		wake_up(&mddev->sb_wait);
+		bio_put(bio);
+		return 0;
+	}
+	bio_put(bio2);
+	bio->bi_private = rdev;
+	return super_written(bio, bytes_done, error);
+}
+
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		    sector_t sector, int size, struct page *page)
 {
@@ -350,16 +378,54 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
+	 *
+	 * As we might need to resubmit the request if BIO_RW_BARRIER
+	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
+	bio->bi_rw = rw;
+
 	atomic_inc(&mddev->pending_writes);
-	submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
+		struct bio *rbio;
+		rw |= (1<<BIO_RW_BARRIER);
+		rbio = bio_clone(bio, GFP_NOIO);
+		rbio->bi_private = bio;
+		rbio->bi_end_io = super_written_barrier;
+		submit_bio(rw, rbio);
+	} else
+		submit_bio(rw, bio);
+}
+
+void md_super_wait(mddev_t *mddev)
+{
+	/* wait for all superblock writes that were scheduled to complete.
+	 * if any had to be retried (due to BARRIER problems), retry them
+	 */
+	DEFINE_WAIT(wq);
+	for(;;) {
+		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&mddev->pending_writes)==0)
+			break;
+		while (mddev->biolist) {
+			struct bio *bio;
+			spin_lock_irq(&mddev->write_lock);
+			bio = mddev->biolist;
+			mddev->biolist = bio->bi_next ;
+			bio->bi_next = NULL;
+			spin_unlock_irq(&mddev->write_lock);
+			submit_bio(bio->bi_rw, bio);
+		}
+		schedule();
+	}
+	finish_wait(&mddev->sb_wait, &wq);
 }
 
 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
@@ -1382,7 +1448,7 @@ static void md_update_sb(mddev_t * mddev)
 	int sync_req;
 
 repeat:
-	spin_lock(&mddev->write_lock);
+	spin_lock_irq(&mddev->write_lock);
 	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
 	mddev->events ++;
@@ -1405,11 +1471,11 @@ repeat:
 	 */
 	if (!mddev->persistent) {
 		mddev->sb_dirty = 0;
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 		wake_up(&mddev->sb_wait);
 		return;
 	}
-	spin_unlock(&mddev->write_lock);
+	spin_unlock_irq(&mddev->write_lock);
 
 	dprintk(KERN_INFO
 		"md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1437,17 +1503,17 @@ repeat:
 		/* only need to write one superblock... */
 		break;
 	}
-	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	md_super_wait(mddev);
 	/* if there was a failure, sb_dirty was set to 1, and we re-write super */
 
-	spin_lock(&mddev->write_lock);
+	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
 		/* have to write it out again */
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 		goto repeat;
 	}
 	mddev->sb_dirty = 0;
-	spin_unlock(&mddev->write_lock);
+	spin_unlock_irq(&mddev->write_lock);
 	wake_up(&mddev->sb_wait);
 
 }
@@ -1989,6 +2055,7 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+	mddev->barriers_work = 1;
 
 	/* before we start the array running, initialise the bitmap */
 	err = bitmap_create(mddev);
@@ -2107,7 +2174,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 		mddev->ro = 1;
 	} else {
 		bitmap_flush(mddev);
-		wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+		md_super_wait(mddev);
 		if (mddev->ro)
 			set_disk_ro(disk, 0);
 		blk_queue_make_request(mddev->queue, md_fail_request);
@@ -3796,13 +3863,13 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 
 	atomic_inc(&mddev->writes_pending);
 	if (mddev->in_sync) {
-		spin_lock(&mddev->write_lock);
+		spin_lock_irq(&mddev->write_lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
 			mddev->sb_dirty = 1;
 			md_wakeup_thread(mddev->thread);
 		}
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 	}
 	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
 }
@@ -4112,7 +4179,7 @@ void md_check_recovery(mddev_t *mddev)
 	if (mddev_trylock(mddev)==0) {
 		int spares =0;
 
-		spin_lock(&mddev->write_lock);
+		spin_lock_irq(&mddev->write_lock);
 		if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
 		    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
 			mddev->in_sync = 1;
@@ -4120,7 +4187,7 @@ void md_check_recovery(mddev_t *mddev)
 		}
 		if (mddev->safemode == 1)
 			mddev->safemode = 0;
-		spin_unlock(&mddev->write_lock);
+		spin_unlock_irq(&mddev->write_lock);
 
 		if (mddev->sb_dirty)
 			md_update_sb(mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb6b866c28f5..1cbf51fbd43f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
-	int mirror, behind;
+	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 		if (r1_bio->bios[mirror] == bio)
 			break;
 
-	/*
-	 * this branch is our 'one mirror IO has finished' event handler:
-	 */
-	if (!uptodate) {
-		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-		/* an I/O failed, we can't clear the bitmap */
-		set_bit(R1BIO_Degraded, &r1_bio->state);
-	} else
+	if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
+		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
+		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
+		r1_bio->mddev->barriers_work = 0;
+	} else {
 		/*
-		 * Set R1BIO_Uptodate in our master bio, so that
-		 * we will return a good error code for to the higher
-		 * levels even if IO on some other mirrored buffer fails.
-		 *
-		 * The 'master' represents the composite IO operation to
-		 * user-side. So if something waits for IO, then it will
-		 * wait for the 'master' bio.
+		 * this branch is our 'one mirror IO has finished' event handler:
 		 */
-		set_bit(R1BIO_Uptodate, &r1_bio->state);
-
-	update_head_pos(mirror, r1_bio);
-
-	behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
-	if (behind) {
-		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-			atomic_dec(&r1_bio->behind_remaining);
-
-		/* In behind mode, we ACK the master bio once the I/O has safely
-		 * reached all non-writemostly disks. Setting the Returned bit
-		 * ensures that this gets done only once -- we don't ever want to
-		 * return -EIO here, instead we'll wait */
-
-		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-			/* Maybe we can return now */
-			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-				struct bio *mbio = r1_bio->master_bio;
-				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-				       (unsigned long long) mbio->bi_sector,
-				       (unsigned long long) mbio->bi_sector +
-				       (mbio->bi_size >> 9) - 1);
-				bio_endio(mbio, mbio->bi_size, 0);
+		r1_bio->bios[mirror] = NULL;
+		bio_put(bio);
+		if (!uptodate) {
+			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+			/* an I/O failed, we can't clear the bitmap */
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+		} else
+			/*
+			 * Set R1BIO_Uptodate in our master bio, so that
+			 * we will return a good error code for to the higher
+			 * levels even if IO on some other mirrored buffer fails.
+			 *
+			 * The 'master' represents the composite IO operation to
+			 * user-side. So if something waits for IO, then it will
+			 * wait for the 'master' bio.
+			 */
+			set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+		update_head_pos(mirror, r1_bio);
+
+		if (behind) {
+			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+				atomic_dec(&r1_bio->behind_remaining);
+
+			/* In behind mode, we ACK the master bio once the I/O has safely
+			 * reached all non-writemostly disks. Setting the Returned bit
+			 * ensures that this gets done only once -- we don't ever want to
+			 * return -EIO here, instead we'll wait */
+
+			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+				/* Maybe we can return now */
+				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+					struct bio *mbio = r1_bio->master_bio;
+					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+					       (unsigned long long) mbio->bi_sector,
+					       (unsigned long long) mbio->bi_sector +
+					       (mbio->bi_size >> 9) - 1);
+					bio_endio(mbio, mbio->bi_size, 0);
+				}
 			}
 		}
 	}
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+			reschedule_retry(r1_bio);
+			/* Don't dec_pending yet, we want to hold
+			 * the reference over the retry
+			 */
+			return 0;
+		}
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			/* free extra copy of the data pages */
+/* FIXME bio has been freed!!! */
 			int i = bio->bi_vcnt;
 			while (i--)
 				__free_page(bio->bi_io_vec[i].bv_page);
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
+	int do_barriers;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
 		return 0;
 	}
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
+	do_barriers = bio->bi_rw & BIO_RW_BARRIER;
+	if (do_barriers)
+		set_bit(R1BIO_Barrier, &r1_bio->state);
+
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE;
+		mbio->bi_rw = WRITE | do_barriers;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
+		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+			/* some requests in the r1bio were BIO_RW_BARRIER
+			 * requests which failed with -ENOTSUPP.  Hohumm..
+			 * Better resubmit without the barrier.
+			 * We know which devices to resubmit for, because
+			 * all others have had their bios[] entry cleared.
+			 */
+			int i;
+			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
+			clear_bit(R1BIO_Barrier, &r1_bio->state);
+			for (i=0; i < conf->raid_disks; i++)
+				if (r1_bio->bios[i]) {
+					struct bio_vec *bvec;
+					int j;
+
+					bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+					/* copy pages from the failed bio, as
+					 * this might be a write-behind device */
+					__bio_for_each_segment(bvec, bio, j, 0)
+						bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
+					bio_put(r1_bio->bios[i]);
+					bio->bi_sector = r1_bio->sector +
+						conf->mirrors[i].rdev->data_offset;
+					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+					bio->bi_end_io = raid1_end_write_request;
+					bio->bi_rw = WRITE;
+					bio->bi_private = r1_bio;
+					r1_bio->bios[i] = bio;
+					generic_make_request(bio);
+				}
 		} else {
 			int disk;
 			bio = r1_bio->bios[r1_bio->read_disk];
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 91467a3c4a52..13e7c4b62367 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -89,6 +89,7 @@ extern void md_print_devices (void);
 
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
+extern void md_super_wait(mddev_t *mddev);
 extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 			struct page *page, int rw);
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 11629f92180a..d5854c2b2721 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -122,6 +122,7 @@ struct mdk_rdev_s
 #define	Faulty		1		/* device is known to have a fault */
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
+#define	BarriersNotsupp	5		/* BIO_RW_BARRIER is not supported */
 
 	int desc_nr;			/* descriptor index in the superblock */
 	int raid_disk;			/* role of device in array */
@@ -210,6 +211,13 @@ struct mddev_s
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
+	int				barriers_work;	/* initialised to true, cleared as soon
+							 * as a barrier request to slave
+							 * fails.  Only supported
+							 */
+	struct bio			*biolist;	/* bios that need to be retried
+							 * because BIO_RW_BARRIER is not supported
+							 */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 60e19b667548..292b98f2b408 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -110,7 +110,9 @@ struct r1bio_s {
 #define	R1BIO_Uptodate	0
 #define	R1BIO_IsSync	1
 #define	R1BIO_Degraded	2
 #define	R1BIO_BehindIO	3
+#define	R1BIO_Barrier	4
+#define	R1BIO_BarrierRetry 5
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful.  Otherwise we call when
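
The write-behind ACK test that appears twice in the raid1.c hunks above (behind_remaining >= remaining - 1, checked before remaining is decremented for the completing write) can be read in isolation. Below is a hedged userspace sketch of just that arithmetic; struct r1_state and can_ack_master are invented names standing in for the r1bio fields, and this is only a model of the rule, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Counters mirror r1_bio->remaining / behind_remaining: the master bio may
 * be acknowledged once the only writes still outstanding are to
 * write-mostly (write-behind) devices and everything so far succeeded. */
struct r1_state {
	int remaining;		/* writes still outstanding, incl. this one */
	int behind_remaining;	/* of those, how many are write-behind */
	bool uptodate;		/* at least one write succeeded */
	bool returned;		/* master bio already acknowledged? */
};

/* Evaluated as one write completes, before remaining is decremented,
 * matching the kernel's "behind_remaining >= remaining - 1" test. */
static bool can_ack_master(const struct r1_state *s)
{
	return s->behind_remaining >= s->remaining - 1 &&
	       s->uptodate && !s->returned;
}

int main(void)
{
	/* 3 mirrors, 2 marked write-mostly: when the one fast mirror
	 * completes, 2 >= 3 - 1 holds, so the master bio can be
	 * acknowledged before the slow mirrors finish. */
	struct r1_state s = { .remaining = 3, .behind_remaining = 2,
			      .uptodate = true, .returned = false };
	printf("ack now? %s\n", can_ack_master(&s) ? "yes" : "no");
	return 0;
}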