diff options
author | NeilBrown <neilb@cse.unsw.edu.au> | 2005-09-09 19:23:47 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-09-09 19:39:10 -0400 |
commit | 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 (patch) | |
tree | 7b6cbc6a997e25a7fb6185da7129e539c4ffda8b /drivers/md/raid1.c | |
parent | 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 (diff) |
[PATCH] md: add write-behind support for md/raid1
If a device is flagged 'WriteMostly' and the array has a bitmap, and the
bitmap superblock indicates that write_behind is allowed, then write_behind is
enabled for WriteMostly devices.
Write requests will be acknowledged as complete to the caller (via b_end_io)
when all non-WriteMostly devices have completed the write, but will not be
cleared from the bitmap until all devices complete.
This requires memory allocation to make a local copy of the data being
written. If there is insufficient memory, then we fall back on normal write
semantics.
Signed-Off-By: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 124 |
1 files changed, 117 insertions, 7 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 28839a8193f2..ba7f5f256161 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
222 | { | 222 | { |
223 | struct bio *bio = r1_bio->master_bio; | 223 | struct bio *bio = r1_bio->master_bio; |
224 | 224 | ||
225 | bio_endio(bio, bio->bi_size, | 225 | /* if nobody has done the final endio yet, do it now */ |
226 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | 226 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
227 | PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", | ||
228 | (bio_data_dir(bio) == WRITE) ? "write" : "read", | ||
229 | (unsigned long long) bio->bi_sector, | ||
230 | (unsigned long long) bio->bi_sector + | ||
231 | (bio->bi_size >> 9) - 1); | ||
232 | |||
233 | bio_endio(bio, bio->bi_size, | ||
234 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
235 | } | ||
227 | free_r1bio(r1_bio); | 236 | free_r1bio(r1_bio); |
228 | } | 237 | } |
229 | 238 | ||
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
292 | { | 301 | { |
293 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 302 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
294 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 303 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
295 | int mirror; | 304 | int mirror, behind; |
296 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 305 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
297 | 306 | ||
298 | if (bio->bi_size) | 307 | if (bio->bi_size) |
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
323 | 332 | ||
324 | update_head_pos(mirror, r1_bio); | 333 | update_head_pos(mirror, r1_bio); |
325 | 334 | ||
335 | behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | ||
336 | if (behind) { | ||
337 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | ||
338 | atomic_dec(&r1_bio->behind_remaining); | ||
339 | |||
340 | /* In behind mode, we ACK the master bio once the I/O has safely | ||
341 | * reached all non-writemostly disks. Setting the Returned bit | ||
342 | * ensures that this gets done only once -- we don't ever want to | ||
343 | * return -EIO here, instead we'll wait */ | ||
344 | |||
345 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
346 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
347 | /* Maybe we can return now */ | ||
348 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
349 | struct bio *mbio = r1_bio->master_bio; | ||
350 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
351 | (unsigned long long) mbio->bi_sector, | ||
352 | (unsigned long long) mbio->bi_sector + | ||
353 | (mbio->bi_size >> 9) - 1); | ||
354 | bio_endio(mbio, mbio->bi_size, 0); | ||
355 | } | ||
356 | } | ||
357 | } | ||
326 | /* | 358 | /* |
327 | * | 359 | * |
328 | * Let's see if all mirrored write operations have finished | 360 | * Let's see if all mirrored write operations have finished |
329 | * already. | 361 | * already. |
330 | */ | 362 | */ |
331 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 363 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
364 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
365 | /* free extra copy of the data pages */ | ||
366 | int i = bio->bi_vcnt; | ||
367 | while (i--) | ||
368 | __free_page(bio->bi_io_vec[i].bv_page); | ||
369 | } | ||
332 | /* clear the bitmap if all writes complete successfully */ | 370 | /* clear the bitmap if all writes complete successfully */ |
333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | 371 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, |
334 | r1_bio->sectors, | 372 | r1_bio->sectors, |
335 | !test_bit(R1BIO_Degraded, &r1_bio->state)); | 373 | !test_bit(R1BIO_Degraded, &r1_bio->state), |
374 | behind); | ||
336 | md_write_end(r1_bio->mddev); | 375 | md_write_end(r1_bio->mddev); |
337 | raid_end_bio_io(r1_bio); | 376 | raid_end_bio_io(r1_bio); |
338 | } | 377 | } |
@@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) | |||
562 | spin_unlock_irq(&conf->resync_lock); | 601 | spin_unlock_irq(&conf->resync_lock); |
563 | } | 602 | } |
564 | 603 | ||
604 | /* duplicate the data pages for behind I/O */ | ||
605 | static struct page **alloc_behind_pages(struct bio *bio) | ||
606 | { | ||
607 | int i; | ||
608 | struct bio_vec *bvec; | ||
609 | struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), | ||
610 | GFP_NOIO); | ||
611 | if (unlikely(!pages)) | ||
612 | goto do_sync_io; | ||
613 | |||
614 | memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); | ||
615 | |||
616 | bio_for_each_segment(bvec, bio, i) { | ||
617 | pages[i] = alloc_page(GFP_NOIO); | ||
618 | if (unlikely(!pages[i])) | ||
619 | goto do_sync_io; | ||
620 | memcpy(kmap(pages[i]) + bvec->bv_offset, | ||
621 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | ||
622 | kunmap(pages[i]); | ||
623 | kunmap(bvec->bv_page); | ||
624 | } | ||
625 | |||
626 | return pages; | ||
627 | |||
628 | do_sync_io: | ||
629 | if (pages) | ||
630 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | ||
631 | __free_page(pages[i]); | ||
632 | kfree(pages); | ||
633 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | ||
634 | return NULL; | ||
635 | } | ||
636 | |||
565 | static int make_request(request_queue_t *q, struct bio * bio) | 637 | static int make_request(request_queue_t *q, struct bio * bio) |
566 | { | 638 | { |
567 | mddev_t *mddev = q->queuedata; | 639 | mddev_t *mddev = q->queuedata; |
@@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
574 | struct bitmap *bitmap = mddev->bitmap; | 646 | struct bitmap *bitmap = mddev->bitmap; |
575 | unsigned long flags; | 647 | unsigned long flags; |
576 | struct bio_list bl; | 648 | struct bio_list bl; |
649 | struct page **behind_pages = NULL; | ||
577 | 650 | ||
578 | if (unlikely(bio_barrier(bio))) { | 651 | if (unlikely(bio_barrier(bio))) { |
579 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | 652 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); |
@@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
613 | r1_bio->mddev = mddev; | 686 | r1_bio->mddev = mddev; |
614 | r1_bio->sector = bio->bi_sector; | 687 | r1_bio->sector = bio->bi_sector; |
615 | 688 | ||
616 | r1_bio->state = 0; | ||
617 | |||
618 | if (bio_data_dir(bio) == READ) { | 689 | if (bio_data_dir(bio) == READ) { |
619 | /* | 690 | /* |
620 | * read balancing logic: | 691 | * read balancing logic: |
@@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
675 | } | 746 | } |
676 | rcu_read_unlock(); | 747 | rcu_read_unlock(); |
677 | 748 | ||
749 | BUG_ON(targets == 0); /* we never fail the last device */ | ||
750 | |||
678 | if (targets < conf->raid_disks) { | 751 | if (targets < conf->raid_disks) { |
679 | /* array is degraded, we will not clear the bitmap | 752 | /* array is degraded, we will not clear the bitmap |
680 | * on I/O completion (see raid1_end_write_request) */ | 753 | * on I/O completion (see raid1_end_write_request) */ |
681 | set_bit(R1BIO_Degraded, &r1_bio->state); | 754 | set_bit(R1BIO_Degraded, &r1_bio->state); |
682 | } | 755 | } |
683 | 756 | ||
757 | /* do behind I/O ? */ | ||
758 | if (bitmap && | ||
759 | atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && | ||
760 | (behind_pages = alloc_behind_pages(bio)) != NULL) | ||
761 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
762 | |||
684 | atomic_set(&r1_bio->remaining, 0); | 763 | atomic_set(&r1_bio->remaining, 0); |
764 | atomic_set(&r1_bio->behind_remaining, 0); | ||
685 | 765 | ||
686 | bio_list_init(&bl); | 766 | bio_list_init(&bl); |
687 | for (i = 0; i < disks; i++) { | 767 | for (i = 0; i < disks; i++) { |
@@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
698 | mbio->bi_rw = WRITE; | 778 | mbio->bi_rw = WRITE; |
699 | mbio->bi_private = r1_bio; | 779 | mbio->bi_private = r1_bio; |
700 | 780 | ||
781 | if (behind_pages) { | ||
782 | struct bio_vec *bvec; | ||
783 | int j; | ||
784 | |||
785 | /* Yes, I really want the '__' version so that | ||
786 | * we clear any unused pointer in the io_vec, rather | ||
787 | * than leave them unchanged. This is important | ||
788 | * because when we come to free the pages, we won't | ||
789 | * know the originial bi_idx, so we just free | ||
790 | * them all | ||
791 | */ | ||
792 | __bio_for_each_segment(bvec, mbio, j, 0) | ||
793 | bvec->bv_page = behind_pages[j]; | ||
794 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | ||
795 | atomic_inc(&r1_bio->behind_remaining); | ||
796 | } | ||
797 | |||
701 | atomic_inc(&r1_bio->remaining); | 798 | atomic_inc(&r1_bio->remaining); |
702 | 799 | ||
703 | bio_list_add(&bl, mbio); | 800 | bio_list_add(&bl, mbio); |
704 | } | 801 | } |
802 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | ||
705 | 803 | ||
706 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); | 804 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
805 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
707 | spin_lock_irqsave(&conf->device_lock, flags); | 806 | spin_lock_irqsave(&conf->device_lock, flags); |
708 | bio_list_merge(&conf->pending_bio_list, &bl); | 807 | bio_list_merge(&conf->pending_bio_list, &bl); |
709 | bio_list_init(&bl); | 808 | bio_list_init(&bl); |
@@ -1471,6 +1570,17 @@ out: | |||
1471 | static int stop(mddev_t *mddev) | 1570 | static int stop(mddev_t *mddev) |
1472 | { | 1571 | { |
1473 | conf_t *conf = mddev_to_conf(mddev); | 1572 | conf_t *conf = mddev_to_conf(mddev); |
1573 | struct bitmap *bitmap = mddev->bitmap; | ||
1574 | int behind_wait = 0; | ||
1575 | |||
1576 | /* wait for behind writes to complete */ | ||
1577 | while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | ||
1578 | behind_wait++; | ||
1579 | printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); | ||
1580 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1581 | schedule_timeout(HZ); /* wait a second */ | ||
1582 | /* need to kick something here to make sure I/O goes? */ | ||
1583 | } | ||
1474 | 1584 | ||
1475 | md_unregister_thread(mddev->thread); | 1585 | md_unregister_thread(mddev->thread); |
1476 | mddev->thread = NULL; | 1586 | mddev->thread = NULL; |