diff options
author | NeilBrown <neilb@suse.de> | 2005-11-09 00:39:34 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-11-09 10:56:38 -0500 |
commit | a9701a30470856408d08657eb1bd7ae29a146190 (patch) | |
tree | eb6ea8c82fdc1b50bf56abadeee63a935034cf27 /drivers/md/raid1.c | |
parent | bd926c63b7a6843d3ce2728396c0891e54fce5c4 (diff) |
[PATCH] md: support BIO_RW_BARRIER for md/raid1
We can only accept BARRIER requests if all slaves handle
barriers, and that can, of course, change with time....
So we keep track of whether the whole array seems safe for barriers,
and also whether each individual rdev handles barriers.
We initially assume barriers are OK.
When writing the superblock we try a barrier, and if that fails, we flag
things for no-barriers. This will usually clear the flags fairly quickly.
If writing the superblock finds that BIO_RW_BARRIER is -ENOTSUPP, we need to
resubmit, so introduce function "md_super_wait" which waits for requests to
finish, and retries ENOTSUPP requests without the barrier flag.
When writing the real raid1, write requests which were BIO_RW_BARRIER but
which aren't supported need to be retried. So raid1d is enhanced to do this,
and when any bio write completes (i.e. no retry needed) we remove it from the
r1bio, so that devices needing retry are easy to find.
We should hardly ever get -ENOTSUPP errors when writing data to the raid.
It should only happen if:
1/ the device used to support BARRIER, but now doesn't. Few devices
change like this, though raid1 can!
or
2/ the array has no persistent superblock, so there was no opportunity to
pre-test for barriers when writing the superblock.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 134 |
1 files changed, 92 insertions, 42 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fb6b866c28f5..1cbf51fbd43f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
301 | { | 301 | { |
302 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 302 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
303 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 303 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
304 | int mirror, behind; | 304 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); |
305 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 305 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
306 | 306 | ||
307 | if (bio->bi_size) | 307 | if (bio->bi_size) |
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
311 | if (r1_bio->bios[mirror] == bio) | 311 | if (r1_bio->bios[mirror] == bio) |
312 | break; | 312 | break; |
313 | 313 | ||
314 | /* | 314 | if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { |
315 | * this branch is our 'one mirror IO has finished' event handler: | 315 | set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); |
316 | */ | 316 | set_bit(R1BIO_BarrierRetry, &r1_bio->state); |
317 | if (!uptodate) { | 317 | r1_bio->mddev->barriers_work = 0; |
318 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 318 | } else { |
319 | /* an I/O failed, we can't clear the bitmap */ | ||
320 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
321 | } else | ||
322 | /* | 319 | /* |
323 | * Set R1BIO_Uptodate in our master bio, so that | 320 | * this branch is our 'one mirror IO has finished' event handler: |
324 | * we will return a good error code for to the higher | ||
325 | * levels even if IO on some other mirrored buffer fails. | ||
326 | * | ||
327 | * The 'master' represents the composite IO operation to | ||
328 | * user-side. So if something waits for IO, then it will | ||
329 | * wait for the 'master' bio. | ||
330 | */ | 321 | */ |
331 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 322 | r1_bio->bios[mirror] = NULL; |
332 | 323 | bio_put(bio); | |
333 | update_head_pos(mirror, r1_bio); | 324 | if (!uptodate) { |
334 | 325 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | |
335 | behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | 326 | /* an I/O failed, we can't clear the bitmap */ |
336 | if (behind) { | 327 | set_bit(R1BIO_Degraded, &r1_bio->state); |
337 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | 328 | } else |
338 | atomic_dec(&r1_bio->behind_remaining); | 329 | /* |
339 | 330 | * Set R1BIO_Uptodate in our master bio, so that | |
340 | /* In behind mode, we ACK the master bio once the I/O has safely | 331 | * we will return a good error code for to the higher |
341 | * reached all non-writemostly disks. Setting the Returned bit | 332 | * levels even if IO on some other mirrored buffer fails. |
342 | * ensures that this gets done only once -- we don't ever want to | 333 | * |
343 | * return -EIO here, instead we'll wait */ | 334 | * The 'master' represents the composite IO operation to |
344 | 335 | * user-side. So if something waits for IO, then it will | |
345 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | 336 | * wait for the 'master' bio. |
346 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 337 | */ |
347 | /* Maybe we can return now */ | 338 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
348 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | 339 | |
349 | struct bio *mbio = r1_bio->master_bio; | 340 | update_head_pos(mirror, r1_bio); |
350 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | 341 | |
351 | (unsigned long long) mbio->bi_sector, | 342 | if (behind) { |
352 | (unsigned long long) mbio->bi_sector + | 343 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) |
353 | (mbio->bi_size >> 9) - 1); | 344 | atomic_dec(&r1_bio->behind_remaining); |
354 | bio_endio(mbio, mbio->bi_size, 0); | 345 | |
346 | /* In behind mode, we ACK the master bio once the I/O has safely | ||
347 | * reached all non-writemostly disks. Setting the Returned bit | ||
348 | * ensures that this gets done only once -- we don't ever want to | ||
349 | * return -EIO here, instead we'll wait */ | ||
350 | |||
351 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
352 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
353 | /* Maybe we can return now */ | ||
354 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
355 | struct bio *mbio = r1_bio->master_bio; | ||
356 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
357 | (unsigned long long) mbio->bi_sector, | ||
358 | (unsigned long long) mbio->bi_sector + | ||
359 | (mbio->bi_size >> 9) - 1); | ||
360 | bio_endio(mbio, mbio->bi_size, 0); | ||
361 | } | ||
355 | } | 362 | } |
356 | } | 363 | } |
357 | } | 364 | } |
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
361 | * already. | 368 | * already. |
362 | */ | 369 | */ |
363 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 370 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
371 | if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { | ||
372 | reschedule_retry(r1_bio); | ||
373 | /* Don't dec_pending yet, we want to hold | ||
374 | * the reference over the retry | ||
375 | */ | ||
376 | return 0; | ||
377 | } | ||
364 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | 378 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { |
365 | /* free extra copy of the data pages */ | 379 | /* free extra copy of the data pages */ |
380 | /* FIXME bio has been freed!!! */ | ||
366 | int i = bio->bi_vcnt; | 381 | int i = bio->bi_vcnt; |
367 | while (i--) | 382 | while (i--) |
368 | __free_page(bio->bi_io_vec[i].bv_page); | 383 | __free_page(bio->bi_io_vec[i].bv_page); |
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
648 | struct bio_list bl; | 663 | struct bio_list bl; |
649 | struct page **behind_pages = NULL; | 664 | struct page **behind_pages = NULL; |
650 | const int rw = bio_data_dir(bio); | 665 | const int rw = bio_data_dir(bio); |
666 | int do_barriers; | ||
651 | 667 | ||
652 | if (unlikely(bio_barrier(bio))) { | 668 | if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { |
653 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | 669 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); |
654 | return 0; | 670 | return 0; |
655 | } | 671 | } |
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
759 | atomic_set(&r1_bio->remaining, 0); | 775 | atomic_set(&r1_bio->remaining, 0); |
760 | atomic_set(&r1_bio->behind_remaining, 0); | 776 | atomic_set(&r1_bio->behind_remaining, 0); |
761 | 777 | ||
778 | do_barriers = bio->bi_rw & BIO_RW_BARRIER; | ||
779 | if (do_barriers) | ||
780 | set_bit(R1BIO_Barrier, &r1_bio->state); | ||
781 | |||
762 | bio_list_init(&bl); | 782 | bio_list_init(&bl); |
763 | for (i = 0; i < disks; i++) { | 783 | for (i = 0; i < disks; i++) { |
764 | struct bio *mbio; | 784 | struct bio *mbio; |
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
771 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 791 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; |
772 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 792 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
773 | mbio->bi_end_io = raid1_end_write_request; | 793 | mbio->bi_end_io = raid1_end_write_request; |
774 | mbio->bi_rw = WRITE; | 794 | mbio->bi_rw = WRITE | do_barriers; |
775 | mbio->bi_private = r1_bio; | 795 | mbio->bi_private = r1_bio; |
776 | 796 | ||
777 | if (behind_pages) { | 797 | if (behind_pages) { |
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev) | |||
1153 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | 1173 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { |
1154 | sync_request_write(mddev, r1_bio); | 1174 | sync_request_write(mddev, r1_bio); |
1155 | unplug = 1; | 1175 | unplug = 1; |
1176 | } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { | ||
1177 | /* some requests in the r1bio were BIO_RW_BARRIER | ||
1178 | * requests which failed with -ENOTSUPP. Hohumm.. | ||
1179 | * Better resubmit without the barrier. | ||
1180 | * We know which devices to resubmit for, because | ||
1181 | * all others have had their bios[] entry cleared. | ||
1182 | */ | ||
1183 | int i; | ||
1184 | clear_bit(R1BIO_BarrierRetry, &r1_bio->state); | ||
1185 | clear_bit(R1BIO_Barrier, &r1_bio->state); | ||
1186 | for (i=0; i < conf->raid_disks; i++) | ||
1187 | if (r1_bio->bios[i]) { | ||
1188 | struct bio_vec *bvec; | ||
1189 | int j; | ||
1190 | |||
1191 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | ||
1192 | /* copy pages from the failed bio, as | ||
1193 | * this might be a write-behind device */ | ||
1194 | __bio_for_each_segment(bvec, bio, j, 0) | ||
1195 | bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; | ||
1196 | bio_put(r1_bio->bios[i]); | ||
1197 | bio->bi_sector = r1_bio->sector + | ||
1198 | conf->mirrors[i].rdev->data_offset; | ||
1199 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1200 | bio->bi_end_io = raid1_end_write_request; | ||
1201 | bio->bi_rw = WRITE; | ||
1202 | bio->bi_private = r1_bio; | ||
1203 | r1_bio->bios[i] = bio; | ||
1204 | generic_make_request(bio); | ||
1205 | } | ||
1156 | } else { | 1206 | } else { |
1157 | int disk; | 1207 | int disk; |
1158 | bio = r1_bio->bios[r1_bio->read_disk]; | 1208 | bio = r1_bio->bios[r1_bio->read_disk]; |