aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2005-11-09 00:39:34 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2005-11-09 10:56:38 -0500
commita9701a30470856408d08657eb1bd7ae29a146190 (patch)
treeeb6ea8c82fdc1b50bf56abadeee63a935034cf27 /drivers/md/raid1.c
parentbd926c63b7a6843d3ce2728396c0891e54fce5c4 (diff)
[PATCH] md: support BIO_RW_BARRIER for md/raid1
We can only accept BARRIER requests if all slaves handle barriers, and that can, of course, change with time.... So we keep track of whether the whole array seems safe for barriers, and also whether each individual rdev handles barriers. We initially assume barriers are OK. When writing the superblock we try a barrier, and if that fails, we flag things for no-barriers. This will usually clear the flags fairly quickly. If writing the superblock finds that BIO_RW_BARRIER is -ENOTSUPP, we need to resubmit, so introduce function "md_super_wait" which waits for requests to finish, and retries ENOTSUPP requests without the barrier flag. When writing the real raid1, write requests which were BIO_RW_BARRIER but which aren't supported need to be retried. So raid1d is enhanced to do this, and when any bio write completes (i.e. no retry needed) we remove it from the r1bio, so that devices needing retry are easy to find. We should hardly ever get -ENOTSUPP errors when writing data to the raid. It should only happen if: 1/ the device used to support BARRIER, but now doesn't. Few devices change like this, though raid1 can! or 2/ the array has no persistent superblock, so there was no opportunity to pre-test for barriers when writing the superblock. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c134
1 files changed, 92 insertions, 42 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb6b866c28f5..1cbf51fbd43f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
301{ 301{
302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
304 int mirror, behind; 304 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
305 conf_t *conf = mddev_to_conf(r1_bio->mddev); 305 conf_t *conf = mddev_to_conf(r1_bio->mddev);
306 306
307 if (bio->bi_size) 307 if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
311 if (r1_bio->bios[mirror] == bio) 311 if (r1_bio->bios[mirror] == bio)
312 break; 312 break;
313 313
314 /* 314 if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
315 * this branch is our 'one mirror IO has finished' event handler: 315 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
316 */ 316 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
317 if (!uptodate) { 317 r1_bio->mddev->barriers_work = 0;
318 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 318 } else {
319 /* an I/O failed, we can't clear the bitmap */
320 set_bit(R1BIO_Degraded, &r1_bio->state);
321 } else
322 /* 319 /*
323 * Set R1BIO_Uptodate in our master bio, so that 320 * this branch is our 'one mirror IO has finished' event handler:
324 * we will return a good error code for to the higher
325 * levels even if IO on some other mirrored buffer fails.
326 *
327 * The 'master' represents the composite IO operation to
328 * user-side. So if something waits for IO, then it will
329 * wait for the 'master' bio.
330 */ 321 */
331 set_bit(R1BIO_Uptodate, &r1_bio->state); 322 r1_bio->bios[mirror] = NULL;
332 323 bio_put(bio);
333 update_head_pos(mirror, r1_bio); 324 if (!uptodate) {
334 325 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
335 behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 326 /* an I/O failed, we can't clear the bitmap */
336 if (behind) { 327 set_bit(R1BIO_Degraded, &r1_bio->state);
337 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 328 } else
338 atomic_dec(&r1_bio->behind_remaining); 329 /*
339 330 * Set R1BIO_Uptodate in our master bio, so that
340 /* In behind mode, we ACK the master bio once the I/O has safely 331 * we will return a good error code for to the higher
341 * reached all non-writemostly disks. Setting the Returned bit 332 * levels even if IO on some other mirrored buffer fails.
342 * ensures that this gets done only once -- we don't ever want to 333 *
343 * return -EIO here, instead we'll wait */ 334 * The 'master' represents the composite IO operation to
344 335 * user-side. So if something waits for IO, then it will
345 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 336 * wait for the 'master' bio.
346 test_bit(R1BIO_Uptodate, &r1_bio->state)) { 337 */
347 /* Maybe we can return now */ 338 set_bit(R1BIO_Uptodate, &r1_bio->state);
348 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 339
349 struct bio *mbio = r1_bio->master_bio; 340 update_head_pos(mirror, r1_bio);
350 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", 341
351 (unsigned long long) mbio->bi_sector, 342 if (behind) {
352 (unsigned long long) mbio->bi_sector + 343 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
353 (mbio->bi_size >> 9) - 1); 344 atomic_dec(&r1_bio->behind_remaining);
354 bio_endio(mbio, mbio->bi_size, 0); 345
346 /* In behind mode, we ACK the master bio once the I/O has safely
347 * reached all non-writemostly disks. Setting the Returned bit
348 * ensures that this gets done only once -- we don't ever want to
349 * return -EIO here, instead we'll wait */
350
351 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
352 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
353 /* Maybe we can return now */
354 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
355 struct bio *mbio = r1_bio->master_bio;
356 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
357 (unsigned long long) mbio->bi_sector,
358 (unsigned long long) mbio->bi_sector +
359 (mbio->bi_size >> 9) - 1);
360 bio_endio(mbio, mbio->bi_size, 0);
361 }
355 } 362 }
356 } 363 }
357 } 364 }
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
361 * already. 368 * already.
362 */ 369 */
363 if (atomic_dec_and_test(&r1_bio->remaining)) { 370 if (atomic_dec_and_test(&r1_bio->remaining)) {
371 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
372 reschedule_retry(r1_bio);
373 /* Don't dec_pending yet, we want to hold
374 * the reference over the retry
375 */
376 return 0;
377 }
364 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 378 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
365 /* free extra copy of the data pages */ 379 /* free extra copy of the data pages */
380/* FIXME bio has been freed!!! */
366 int i = bio->bi_vcnt; 381 int i = bio->bi_vcnt;
367 while (i--) 382 while (i--)
368 __free_page(bio->bi_io_vec[i].bv_page); 383 __free_page(bio->bi_io_vec[i].bv_page);
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
648 struct bio_list bl; 663 struct bio_list bl;
649 struct page **behind_pages = NULL; 664 struct page **behind_pages = NULL;
650 const int rw = bio_data_dir(bio); 665 const int rw = bio_data_dir(bio);
666 int do_barriers;
651 667
652 if (unlikely(bio_barrier(bio))) { 668 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
653 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 669 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
654 return 0; 670 return 0;
655 } 671 }
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
759 atomic_set(&r1_bio->remaining, 0); 775 atomic_set(&r1_bio->remaining, 0);
760 atomic_set(&r1_bio->behind_remaining, 0); 776 atomic_set(&r1_bio->behind_remaining, 0);
761 777
778 do_barriers = bio->bi_rw & BIO_RW_BARRIER;
779 if (do_barriers)
780 set_bit(R1BIO_Barrier, &r1_bio->state);
781
762 bio_list_init(&bl); 782 bio_list_init(&bl);
763 for (i = 0; i < disks; i++) { 783 for (i = 0; i < disks; i++) {
764 struct bio *mbio; 784 struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
771 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 791 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
772 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 792 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
773 mbio->bi_end_io = raid1_end_write_request; 793 mbio->bi_end_io = raid1_end_write_request;
774 mbio->bi_rw = WRITE; 794 mbio->bi_rw = WRITE | do_barriers;
775 mbio->bi_private = r1_bio; 795 mbio->bi_private = r1_bio;
776 796
777 if (behind_pages) { 797 if (behind_pages) {
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
1153 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1173 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1154 sync_request_write(mddev, r1_bio); 1174 sync_request_write(mddev, r1_bio);
1155 unplug = 1; 1175 unplug = 1;
1176 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1177 /* some requests in the r1bio were BIO_RW_BARRIER
1178 * requests which failed with -ENOTSUPP. Hohumm..
1179 * Better resubmit without the barrier.
1180 * We know which devices to resubmit for, because
1181 * all others have had their bios[] entry cleared.
1182 */
1183 int i;
1184 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1185 clear_bit(R1BIO_Barrier, &r1_bio->state);
1186 for (i=0; i < conf->raid_disks; i++)
1187 if (r1_bio->bios[i]) {
1188 struct bio_vec *bvec;
1189 int j;
1190
1191 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1192 /* copy pages from the failed bio, as
1193 * this might be a write-behind device */
1194 __bio_for_each_segment(bvec, bio, j, 0)
1195 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1196 bio_put(r1_bio->bios[i]);
1197 bio->bi_sector = r1_bio->sector +
1198 conf->mirrors[i].rdev->data_offset;
1199 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1200 bio->bi_end_io = raid1_end_write_request;
1201 bio->bi_rw = WRITE;
1202 bio->bi_private = r1_bio;
1203 r1_bio->bios[i] = bio;
1204 generic_make_request(bio);
1205 }
1156 } else { 1206 } else {
1157 int disk; 1207 int disk;
1158 bio = r1_bio->bios[r1_bio->read_disk]; 1208 bio = r1_bio->bios[r1_bio->read_disk];