aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
authorNeilBrown <neilb@cse.unsw.edu.au>2005-09-09 19:23:47 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-09-09 19:39:10 -0400
commit4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 (patch)
tree7b6cbc6a997e25a7fb6185da7129e539c4ffda8b /drivers/md/raid1.c
parent8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 (diff)
[PATCH] md: add write-behind support for md/raid1
If a device is flagged 'WriteMostly' and the array has a bitmap, and the bitmap superblock indicates that write_behind is allowed, then write_behind is enabled for WriteMostly devices. Write requests will be acknowledged as complete to the caller (via b_end_io) when all non-WriteMostly devices have completed the write, but will not be cleared from the bitmap until all devices complete. This requires memory allocation to make a local copy of the data being written. If there is insufficient memory, then we fall back on normal write semantics. Signed-Off-By: Paul Clements <paul.clements@steeleye.com> Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c124
1 files changed, 117 insertions, 7 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 28839a8193f2..ba7f5f256161 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
222{ 222{
223 struct bio *bio = r1_bio->master_bio; 223 struct bio *bio = r1_bio->master_bio;
224 224
225 bio_endio(bio, bio->bi_size, 225 /* if nobody has done the final endio yet, do it now */
226 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); 226 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
227 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
228 (bio_data_dir(bio) == WRITE) ? "write" : "read",
229 (unsigned long long) bio->bi_sector,
230 (unsigned long long) bio->bi_sector +
231 (bio->bi_size >> 9) - 1);
232
233 bio_endio(bio, bio->bi_size,
234 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
235 }
227 free_r1bio(r1_bio); 236 free_r1bio(r1_bio);
228} 237}
229 238
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
292{ 301{
293 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
294 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
295 int mirror; 304 int mirror, behind;
296 conf_t *conf = mddev_to_conf(r1_bio->mddev); 305 conf_t *conf = mddev_to_conf(r1_bio->mddev);
297 306
298 if (bio->bi_size) 307 if (bio->bi_size)
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
323 332
324 update_head_pos(mirror, r1_bio); 333 update_head_pos(mirror, r1_bio);
325 334
335 behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
336 if (behind) {
337 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
338 atomic_dec(&r1_bio->behind_remaining);
339
340 /* In behind mode, we ACK the master bio once the I/O has safely
341 * reached all non-writemostly disks. Setting the Returned bit
342 * ensures that this gets done only once -- we don't ever want to
343 * return -EIO here, instead we'll wait */
344
345 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
346 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
347 /* Maybe we can return now */
348 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
349 struct bio *mbio = r1_bio->master_bio;
350 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
351 (unsigned long long) mbio->bi_sector,
352 (unsigned long long) mbio->bi_sector +
353 (mbio->bi_size >> 9) - 1);
354 bio_endio(mbio, mbio->bi_size, 0);
355 }
356 }
357 }
326 /* 358 /*
327 * 359 *
328 * Let's see if all mirrored write operations have finished 360 * Let's see if all mirrored write operations have finished
329 * already. 361 * already.
330 */ 362 */
331 if (atomic_dec_and_test(&r1_bio->remaining)) { 363 if (atomic_dec_and_test(&r1_bio->remaining)) {
364 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
365 /* free extra copy of the data pages */
366 int i = bio->bi_vcnt;
367 while (i--)
368 __free_page(bio->bi_io_vec[i].bv_page);
369 }
332 /* clear the bitmap if all writes complete successfully */ 370 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 371 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors, 372 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state)); 373 !test_bit(R1BIO_Degraded, &r1_bio->state),
374 behind);
336 md_write_end(r1_bio->mddev); 375 md_write_end(r1_bio->mddev);
337 raid_end_bio_io(r1_bio); 376 raid_end_bio_io(r1_bio);
338 } 377 }
@@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect)
562 spin_unlock_irq(&conf->resync_lock); 601 spin_unlock_irq(&conf->resync_lock);
563} 602}
564 603
604/* duplicate the data pages for behind I/O */
605static struct page **alloc_behind_pages(struct bio *bio)
606{
607 int i;
608 struct bio_vec *bvec;
609 struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
610 GFP_NOIO);
611 if (unlikely(!pages))
612 goto do_sync_io;
613
614 memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
615
616 bio_for_each_segment(bvec, bio, i) {
617 pages[i] = alloc_page(GFP_NOIO);
618 if (unlikely(!pages[i]))
619 goto do_sync_io;
620 memcpy(kmap(pages[i]) + bvec->bv_offset,
621 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
622 kunmap(pages[i]);
623 kunmap(bvec->bv_page);
624 }
625
626 return pages;
627
628do_sync_io:
629 if (pages)
630 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
631 __free_page(pages[i]);
632 kfree(pages);
633 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
634 return NULL;
635}
636
565static int make_request(request_queue_t *q, struct bio * bio) 637static int make_request(request_queue_t *q, struct bio * bio)
566{ 638{
567 mddev_t *mddev = q->queuedata; 639 mddev_t *mddev = q->queuedata;
@@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
574 struct bitmap *bitmap = mddev->bitmap; 646 struct bitmap *bitmap = mddev->bitmap;
575 unsigned long flags; 647 unsigned long flags;
576 struct bio_list bl; 648 struct bio_list bl;
649 struct page **behind_pages = NULL;
577 650
578 if (unlikely(bio_barrier(bio))) { 651 if (unlikely(bio_barrier(bio))) {
579 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 652 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio)
613 r1_bio->mddev = mddev; 686 r1_bio->mddev = mddev;
614 r1_bio->sector = bio->bi_sector; 687 r1_bio->sector = bio->bi_sector;
615 688
616 r1_bio->state = 0;
617
618 if (bio_data_dir(bio) == READ) { 689 if (bio_data_dir(bio) == READ) {
619 /* 690 /*
620 * read balancing logic: 691 * read balancing logic:
@@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio)
675 } 746 }
676 rcu_read_unlock(); 747 rcu_read_unlock();
677 748
749 BUG_ON(targets == 0); /* we never fail the last device */
750
678 if (targets < conf->raid_disks) { 751 if (targets < conf->raid_disks) {
679 /* array is degraded, we will not clear the bitmap 752 /* array is degraded, we will not clear the bitmap
680 * on I/O completion (see raid1_end_write_request) */ 753 * on I/O completion (see raid1_end_write_request) */
681 set_bit(R1BIO_Degraded, &r1_bio->state); 754 set_bit(R1BIO_Degraded, &r1_bio->state);
682 } 755 }
683 756
757 /* do behind I/O ? */
758 if (bitmap &&
759 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
760 (behind_pages = alloc_behind_pages(bio)) != NULL)
761 set_bit(R1BIO_BehindIO, &r1_bio->state);
762
684 atomic_set(&r1_bio->remaining, 0); 763 atomic_set(&r1_bio->remaining, 0);
764 atomic_set(&r1_bio->behind_remaining, 0);
685 765
686 bio_list_init(&bl); 766 bio_list_init(&bl);
687 for (i = 0; i < disks; i++) { 767 for (i = 0; i < disks; i++) {
@@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio)
698 mbio->bi_rw = WRITE; 778 mbio->bi_rw = WRITE;
699 mbio->bi_private = r1_bio; 779 mbio->bi_private = r1_bio;
700 780
781 if (behind_pages) {
782 struct bio_vec *bvec;
783 int j;
784
785 /* Yes, I really want the '__' version so that
786 * we clear any unused pointer in the io_vec, rather
787 * than leave them unchanged. This is important
788 * because when we come to free the pages, we won't
789 * know the originial bi_idx, so we just free
790 * them all
791 */
792 __bio_for_each_segment(bvec, mbio, j, 0)
793 bvec->bv_page = behind_pages[j];
794 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
795 atomic_inc(&r1_bio->behind_remaining);
796 }
797
701 atomic_inc(&r1_bio->remaining); 798 atomic_inc(&r1_bio->remaining);
702 799
703 bio_list_add(&bl, mbio); 800 bio_list_add(&bl, mbio);
704 } 801 }
802 kfree(behind_pages); /* the behind pages are attached to the bios now */
705 803
706 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); 804 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
805 test_bit(R1BIO_BehindIO, &r1_bio->state));
707 spin_lock_irqsave(&conf->device_lock, flags); 806 spin_lock_irqsave(&conf->device_lock, flags);
708 bio_list_merge(&conf->pending_bio_list, &bl); 807 bio_list_merge(&conf->pending_bio_list, &bl);
709 bio_list_init(&bl); 808 bio_list_init(&bl);
@@ -1471,6 +1570,17 @@ out:
1471static int stop(mddev_t *mddev) 1570static int stop(mddev_t *mddev)
1472{ 1571{
1473 conf_t *conf = mddev_to_conf(mddev); 1572 conf_t *conf = mddev_to_conf(mddev);
1573 struct bitmap *bitmap = mddev->bitmap;
1574 int behind_wait = 0;
1575
1576 /* wait for behind writes to complete */
1577 while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
1578 behind_wait++;
1579 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
1580 set_current_state(TASK_UNINTERRUPTIBLE);
1581 schedule_timeout(HZ); /* wait a second */
1582 /* need to kick something here to make sure I/O goes? */
1583 }
1474 1584
1475 md_unregister_thread(mddev->thread); 1585 md_unregister_thread(mddev->thread);
1476 mddev->thread = NULL; 1586 mddev->thread = NULL;