Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--	drivers/md/raid1.c | 134 ++++++++++++++++++++++++++++++++--------------
1 file changed, 92 insertions(+), 42 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb6b866c28f5..1cbf51fbd43f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
-	int mirror, behind;
+	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
 		if (r1_bio->bios[mirror] == bio)
 			break;
 
-	/*
-	 * this branch is our 'one mirror IO has finished' event handler:
-	 */
-	if (!uptodate) {
-		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-		/* an I/O failed, we can't clear the bitmap */
-		set_bit(R1BIO_Degraded, &r1_bio->state);
-	} else
+	if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
+		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
+		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
+		r1_bio->mddev->barriers_work = 0;
+	} else {
 		/*
-		 * Set R1BIO_Uptodate in our master bio, so that
-		 * we will return a good error code for to the higher
-		 * levels even if IO on some other mirrored buffer fails.
-		 *
-		 * The 'master' represents the composite IO operation to
-		 * user-side. So if something waits for IO, then it will
-		 * wait for the 'master' bio.
+		 * this branch is our 'one mirror IO has finished' event handler:
 		 */
-		set_bit(R1BIO_Uptodate, &r1_bio->state);
-
-	update_head_pos(mirror, r1_bio);
-
-	behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
-	if (behind) {
-		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-			atomic_dec(&r1_bio->behind_remaining);
-
-		/* In behind mode, we ACK the master bio once the I/O has safely
-		 * reached all non-writemostly disks. Setting the Returned bit
-		 * ensures that this gets done only once -- we don't ever want to
-		 * return -EIO here, instead we'll wait */
-
-		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-			/* Maybe we can return now */
-			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-				struct bio *mbio = r1_bio->master_bio;
-				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-				       (unsigned long long) mbio->bi_sector,
-				       (unsigned long long) mbio->bi_sector +
-				       (mbio->bi_size >> 9) - 1);
-				bio_endio(mbio, mbio->bi_size, 0);
+		r1_bio->bios[mirror] = NULL;
+		bio_put(bio);
+		if (!uptodate) {
+			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+			/* an I/O failed, we can't clear the bitmap */
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+		} else
+			/*
+			 * Set R1BIO_Uptodate in our master bio, so that
+			 * we will return a good error code for to the higher
+			 * levels even if IO on some other mirrored buffer fails.
+			 *
+			 * The 'master' represents the composite IO operation to
+			 * user-side. So if something waits for IO, then it will
+			 * wait for the 'master' bio.
+			 */
+			set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+		update_head_pos(mirror, r1_bio);
+
+		if (behind) {
+			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+				atomic_dec(&r1_bio->behind_remaining);
+
+			/* In behind mode, we ACK the master bio once the I/O has safely
+			 * reached all non-writemostly disks. Setting the Returned bit
+			 * ensures that this gets done only once -- we don't ever want to
+			 * return -EIO here, instead we'll wait */
+
+			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+				/* Maybe we can return now */
+				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+					struct bio *mbio = r1_bio->master_bio;
+					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+					       (unsigned long long) mbio->bi_sector,
+					       (unsigned long long) mbio->bi_sector +
+					       (mbio->bi_size >> 9) - 1);
+					bio_endio(mbio, mbio->bi_size, 0);
+				}
 			}
 		}
 	}
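Two details in this hunk are easy to miss. First, `behind` is now latched from r1_bio->state at declaration (first hunk above), since the test no longer sits on the main path. Second, and central to the patch: a barrier write that completes with -ENOTSUPP is routed away from md_error() entirely; the rdev is flagged BarriersNotsupp, the r1bio is marked R1BIO_BarrierRetry, and its bios[mirror] slot is deliberately *not* cleared, which is how the retry path later knows which mirrors to resubmit to. A minimal userspace model of that decision (everything with a MODEL_/_model suffix is a stand-in, not kernel API):

#include <stdbool.h>
#include <stddef.h>

#define MODEL_ENOTSUPP 524	/* same magnitude as the kernel's ENOTSUPP */

enum { R1_BARRIER = 1 << 0, R1_BARRIER_RETRY = 1 << 1, R1_DEGRADED = 1 << 2 };

struct mirror_model { bool barriers_broken; };

struct r1bio_model {
	unsigned state;
	void *bios[4];	/* non-NULL slot: write to that mirror not yet settled */
};

/* Model of the new decision at the top of raid1_end_write_request():
 * a barrier write failing with -ENOTSUPP is not a media error. */
static void end_write_model(struct r1bio_model *r1, struct mirror_model *m,
			    int mirror, int error, bool uptodate)
{
	if (error == -MODEL_ENOTSUPP && (r1->state & R1_BARRIER)) {
		m->barriers_broken = true;	/* like BarriersNotsupp + barriers_work = 0 */
		r1->state |= R1_BARRIER_RETRY;
		return;				/* bios[mirror] deliberately left in place */
	}
	r1->bios[mirror] = NULL;		/* normal path: this slot is finished */
	if (!uptodate)
		r1->state |= R1_DEGRADED;	/* real failure: bitmap must stay dirty */
}

int main(void)
{
	struct mirror_model m = { false };
	struct r1bio_model r1 = { .state = R1_BARRIER, .bios = { &m, &m, NULL, NULL } };

	end_write_model(&r1, &m, 0, -MODEL_ENOTSUPP, false);
	return (r1.state & R1_BARRIER_RETRY) ? 0 : 1;	/* slot 0 kept for the retry */
}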
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+			reschedule_retry(r1_bio);
+			/* Don't dec_pending yet, we want to hold
+			 * the reference over the retry
+			 */
+			return 0;
+		}
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			/* free extra copy of the data pages */
+			/* FIXME bio has been freed!!! */
 			int i = bio->bi_vcnt;
 			while (i--)
 				__free_page(bio->bi_io_vec[i].bv_page);
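This hunk decides what happens when the last outstanding write of the r1bio completes: if R1BIO_BarrierRetry is set, the whole r1bio is queued to raid1d via reschedule_retry() and the function returns early, deliberately skipping the usual dec_pending so the reference stays held across the retry. The new FIXME should be read literally, too: on the normal path the previous hunk already did bio_put(bio), so the write-behind cleanup below it can touch a freed bio; the comment flags that for a follow-up rather than fixing it here. A sketch of the hold-the-last-reference pattern, with C11 atomics standing in for the kernel's atomic_t (defer_to_retry_thread() and finish_request() are hypothetical):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct req_model {
	atomic_int remaining;	/* completions still outstanding */
	bool needs_retry;	/* analogous to R1BIO_BarrierRetry */
};

static void defer_to_retry_thread(struct req_model *r)	/* ~reschedule_retry() */
{
	(void)r;
	printf("queued for retry; reference still held\n");
}

static void finish_request(struct req_model *r)		/* normal completion */
{
	(void)r;
	printf("completed and released\n");
}

static void on_write_done(struct req_model *r)
{
	/* Only the completion that drops the count to zero may act. */
	if (atomic_fetch_sub(&r->remaining, 1) == 1) {
		if (r->needs_retry) {
			defer_to_retry_thread(r);
			return;	/* mirrors the early 'return 0' in the hunk */
		}
		finish_request(r);
	}
}

int main(void)
{
	struct req_model r = { .remaining = 2, .needs_retry = true };
	on_write_done(&r);	/* not last: nothing happens */
	on_write_done(&r);	/* last: handed off, reference kept */
	return 0;
}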
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
+	int do_barriers;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
 		return 0;
 	}
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
+	do_barriers = bio->bi_rw & BIO_RW_BARRIER;
+	if (do_barriers)
+		set_bit(R1BIO_Barrier, &r1_bio->state);
+
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io = raid1_end_write_request;
-		mbio->bi_rw = WRITE;
+		mbio->bi_rw = WRITE | do_barriers;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
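Taken together, the three make_request() hunks form the submission side of the scheme: barrier bios are rejected up front only once barriers_work has been cleared by a previous -ENOTSUPP completion (so the first barrier is attempted optimistically), the r1bio is tagged R1BIO_Barrier, and each per-mirror bio is submitted as WRITE | do_barriers. One hedged observation: the gate above uses bio_barrier(bio), which in bio.h of this era masks with (1 << BIO_RW_BARRIER), while do_barriers masks bi_rw with BIO_RW_BARRIER itself; if BIO_RW_BARRIER is a bit number here, the two tests look at different bits, which may be why later code at this spot converged on the bio_barrier() form. A small runnable model of the intended flag plumbing (MODEL_* values are illustrative, not the kernel's):

#include <stdio.h>

/* Illustrative flag values only; in 2.6-era bio.h, BIO_RW_BARRIER is a
 * bit number and bio_barrier() masks bi_rw with (1 << BIO_RW_BARRIER). */
enum {
	MODEL_WRITE       = 1 << 0,
	MODEL_BARRIER_BIT = 2,
	MODEL_BARRIER     = 1 << MODEL_BARRIER_BIT,
};

/* What the per-mirror submission does: inherit only the barrier bit. */
static unsigned mirror_rw(unsigned master_rw)
{
	unsigned do_barriers = master_rw & MODEL_BARRIER;	/* 0 or the bit */
	return MODEL_WRITE | do_barriers;
}

int main(void)
{
	printf("plain write   -> %#x\n", mirror_rw(MODEL_WRITE));
	printf("barrier write -> %#x\n", mirror_rw(MODEL_WRITE | MODEL_BARRIER));
	return 0;
}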
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
+		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+			/* some requests in the r1bio were BIO_RW_BARRIER
+			 * requests which failed with -ENOTSUPP. Hohumm..
+			 * Better resubmit without the barrier.
+			 * We know which devices to resubmit for, because
+			 * all others have had their bios[] entry cleared.
+			 */
+			int i;
+			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
+			clear_bit(R1BIO_Barrier, &r1_bio->state);
+			for (i=0; i < conf->raid_disks; i++)
+				if (r1_bio->bios[i]) {
+					struct bio_vec *bvec;
+					int j;
+
+					bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+					/* copy pages from the failed bio, as
+					 * this might be a write-behind device */
+					__bio_for_each_segment(bvec, bio, j, 0)
+						bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
+					bio_put(r1_bio->bios[i]);
+					bio->bi_sector = r1_bio->sector +
+						conf->mirrors[i].rdev->data_offset;
+					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+					bio->bi_end_io = raid1_end_write_request;
+					bio->bi_rw = WRITE;
+					bio->bi_private = r1_bio;
+					r1_bio->bios[i] = bio;
+					generic_make_request(bio);
+				}
 		} else {
 			int disk;
 			bio = r1_bio->bios[r1_bio->read_disk];
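The raid1d() hunk is the other half of the retry. For an r1bio carrying R1BIO_BarrierRetry, both barrier bits are cleared and every mirror whose bios[] slot is still populated (exactly the barrier writes that failed, since successful completions NULLed their slots) gets a fresh clone of the master bio. The page pointers are copied from the failed bio rather than the master because a write-behind device may be writing the private page copies, and the clone is resubmitted as a plain WRITE. A self-contained model of "resubmit only the occupied slots" (clone_master() and submit() are stand-ins for bio_clone() and generic_make_request()):

#include <stdio.h>
#include <stdlib.h>

#define MIRRORS 4

struct bio_model { unsigned rw; int id; };

static struct bio_model *clone_master(const struct bio_model *master)
{
	struct bio_model *b = malloc(sizeof(*b));
	*b = *master;		/* stand-in for bio_clone(master, GFP_NOIO) */
	return b;
}

static void submit(struct bio_model *b)	/* stand-in for generic_make_request */
{
	printf("resubmit to mirror %d, rw=%#x (barrier stripped)\n", b->id, b->rw);
}

int main(void)
{
	struct bio_model master = { .rw = 0x1 /* WRITE only */, .id = -1 };
	struct bio_model *slots[MIRRORS] = { 0 };

	slots[1] = clone_master(&master);	/* pretend mirrors 1 and 3 failed */
	slots[3] = clone_master(&master);	/* their barrier writes with -ENOTSUPP */

	for (int i = 0; i < MIRRORS; i++)
		if (slots[i]) {			/* occupied slot == needs resubmit */
			struct bio_model *nb = clone_master(&master);
			nb->id = i;
			free(slots[i]);		/* ~bio_put() on the failed clone */
			slots[i] = nb;
			submit(nb);
		}

	for (int i = 0; i < MIRRORS; i++)
		free(slots[i]);
	return 0;
}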