diff options
Diffstat (limited to 'drivers/md/raid1.c')
| -rw-r--r-- | drivers/md/raid1.c | 124 |
1 files changed, 117 insertions, 7 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 28839a8193f2..ba7f5f256161 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
| 222 | { | 222 | { |
| 223 | struct bio *bio = r1_bio->master_bio; | 223 | struct bio *bio = r1_bio->master_bio; |
| 224 | 224 | ||
| 225 | bio_endio(bio, bio->bi_size, | 225 | /* if nobody has done the final endio yet, do it now */ |
| 226 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | 226 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
| 227 | PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", | ||
| 228 | (bio_data_dir(bio) == WRITE) ? "write" : "read", | ||
| 229 | (unsigned long long) bio->bi_sector, | ||
| 230 | (unsigned long long) bio->bi_sector + | ||
| 231 | (bio->bi_size >> 9) - 1); | ||
| 232 | |||
| 233 | bio_endio(bio, bio->bi_size, | ||
| 234 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
| 235 | } | ||
| 227 | free_r1bio(r1_bio); | 236 | free_r1bio(r1_bio); |
| 228 | } | 237 | } |
| 229 | 238 | ||
| @@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
| 292 | { | 301 | { |
| 293 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 302 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 294 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 303 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 295 | int mirror; | 304 | int mirror, behind; |
| 296 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 305 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
| 297 | 306 | ||
| 298 | if (bio->bi_size) | 307 | if (bio->bi_size) |
| @@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
| 323 | 332 | ||
| 324 | update_head_pos(mirror, r1_bio); | 333 | update_head_pos(mirror, r1_bio); |
| 325 | 334 | ||
| 335 | behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | ||
| 336 | if (behind) { | ||
| 337 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | ||
| 338 | atomic_dec(&r1_bio->behind_remaining); | ||
| 339 | |||
| 340 | /* In behind mode, we ACK the master bio once the I/O has safely | ||
| 341 | * reached all non-writemostly disks. Setting the Returned bit | ||
| 342 | * ensures that this gets done only once -- we don't ever want to | ||
| 343 | * return -EIO here, instead we'll wait */ | ||
| 344 | |||
| 345 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
| 346 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
| 347 | /* Maybe we can return now */ | ||
| 348 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
| 349 | struct bio *mbio = r1_bio->master_bio; | ||
| 350 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
| 351 | (unsigned long long) mbio->bi_sector, | ||
| 352 | (unsigned long long) mbio->bi_sector + | ||
| 353 | (mbio->bi_size >> 9) - 1); | ||
| 354 | bio_endio(mbio, mbio->bi_size, 0); | ||
| 355 | } | ||
| 356 | } | ||
| 357 | } | ||
| 326 | /* | 358 | /* |
| 327 | * | 359 | * |
| 328 | * Let's see if all mirrored write operations have finished | 360 | * Let's see if all mirrored write operations have finished |
| 329 | * already. | 361 | * already. |
| 330 | */ | 362 | */ |
| 331 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 363 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
| 364 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
| 365 | /* free extra copy of the data pages */ | ||
| 366 | int i = bio->bi_vcnt; | ||
| 367 | while (i--) | ||
| 368 | __free_page(bio->bi_io_vec[i].bv_page); | ||
| 369 | } | ||
| 332 | /* clear the bitmap if all writes complete successfully */ | 370 | /* clear the bitmap if all writes complete successfully */ |
| 333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | 371 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, |
| 334 | r1_bio->sectors, | 372 | r1_bio->sectors, |
| 335 | !test_bit(R1BIO_Degraded, &r1_bio->state)); | 373 | !test_bit(R1BIO_Degraded, &r1_bio->state), |
| 374 | behind); | ||
| 336 | md_write_end(r1_bio->mddev); | 375 | md_write_end(r1_bio->mddev); |
| 337 | raid_end_bio_io(r1_bio); | 376 | raid_end_bio_io(r1_bio); |
| 338 | } | 377 | } |
| @@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) | |||
| 562 | spin_unlock_irq(&conf->resync_lock); | 601 | spin_unlock_irq(&conf->resync_lock); |
| 563 | } | 602 | } |
| 564 | 603 | ||
| 604 | /* duplicate the data pages for behind I/O */ | ||
| 605 | static struct page **alloc_behind_pages(struct bio *bio) | ||
| 606 | { | ||
| 607 | int i; | ||
| 608 | struct bio_vec *bvec; | ||
| 609 | struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), | ||
| 610 | GFP_NOIO); | ||
| 611 | if (unlikely(!pages)) | ||
| 612 | goto do_sync_io; | ||
| 613 | |||
| 614 | memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); | ||
| 615 | |||
| 616 | bio_for_each_segment(bvec, bio, i) { | ||
| 617 | pages[i] = alloc_page(GFP_NOIO); | ||
| 618 | if (unlikely(!pages[i])) | ||
| 619 | goto do_sync_io; | ||
| 620 | memcpy(kmap(pages[i]) + bvec->bv_offset, | ||
| 621 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | ||
| 622 | kunmap(pages[i]); | ||
| 623 | kunmap(bvec->bv_page); | ||
| 624 | } | ||
| 625 | |||
| 626 | return pages; | ||
| 627 | |||
| 628 | do_sync_io: | ||
| 629 | if (pages) | ||
| 630 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | ||
| 631 | __free_page(pages[i]); | ||
| 632 | kfree(pages); | ||
| 633 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | ||
| 634 | return NULL; | ||
| 635 | } | ||
| 636 | |||
| 565 | static int make_request(request_queue_t *q, struct bio * bio) | 637 | static int make_request(request_queue_t *q, struct bio * bio) |
| 566 | { | 638 | { |
| 567 | mddev_t *mddev = q->queuedata; | 639 | mddev_t *mddev = q->queuedata; |
| @@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 574 | struct bitmap *bitmap = mddev->bitmap; | 646 | struct bitmap *bitmap = mddev->bitmap; |
| 575 | unsigned long flags; | 647 | unsigned long flags; |
| 576 | struct bio_list bl; | 648 | struct bio_list bl; |
| 649 | struct page **behind_pages = NULL; | ||
| 577 | 650 | ||
| 578 | if (unlikely(bio_barrier(bio))) { | 651 | if (unlikely(bio_barrier(bio))) { |
| 579 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | 652 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); |
| @@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 613 | r1_bio->mddev = mddev; | 686 | r1_bio->mddev = mddev; |
| 614 | r1_bio->sector = bio->bi_sector; | 687 | r1_bio->sector = bio->bi_sector; |
| 615 | 688 | ||
| 616 | r1_bio->state = 0; | ||
| 617 | |||
| 618 | if (bio_data_dir(bio) == READ) { | 689 | if (bio_data_dir(bio) == READ) { |
| 619 | /* | 690 | /* |
| 620 | * read balancing logic: | 691 | * read balancing logic: |
| @@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 675 | } | 746 | } |
| 676 | rcu_read_unlock(); | 747 | rcu_read_unlock(); |
| 677 | 748 | ||
| 749 | BUG_ON(targets == 0); /* we never fail the last device */ | ||
| 750 | |||
| 678 | if (targets < conf->raid_disks) { | 751 | if (targets < conf->raid_disks) { |
| 679 | /* array is degraded, we will not clear the bitmap | 752 | /* array is degraded, we will not clear the bitmap |
| 680 | * on I/O completion (see raid1_end_write_request) */ | 753 | * on I/O completion (see raid1_end_write_request) */ |
| 681 | set_bit(R1BIO_Degraded, &r1_bio->state); | 754 | set_bit(R1BIO_Degraded, &r1_bio->state); |
| 682 | } | 755 | } |
| 683 | 756 | ||
| 757 | /* do behind I/O ? */ | ||
| 758 | if (bitmap && | ||
| 759 | atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && | ||
| 760 | (behind_pages = alloc_behind_pages(bio)) != NULL) | ||
| 761 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
| 762 | |||
| 684 | atomic_set(&r1_bio->remaining, 0); | 763 | atomic_set(&r1_bio->remaining, 0); |
| 764 | atomic_set(&r1_bio->behind_remaining, 0); | ||
| 685 | 765 | ||
| 686 | bio_list_init(&bl); | 766 | bio_list_init(&bl); |
| 687 | for (i = 0; i < disks; i++) { | 767 | for (i = 0; i < disks; i++) { |
| @@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 698 | mbio->bi_rw = WRITE; | 778 | mbio->bi_rw = WRITE; |
| 699 | mbio->bi_private = r1_bio; | 779 | mbio->bi_private = r1_bio; |
| 700 | 780 | ||
| 781 | if (behind_pages) { | ||
| 782 | struct bio_vec *bvec; | ||
| 783 | int j; | ||
| 784 | |||
| 785 | /* Yes, I really want the '__' version so that | ||
| 786 | * we clear any unused pointer in the io_vec, rather | ||
| 787 | * than leave them unchanged. This is important | ||
| 788 | * because when we come to free the pages, we won't | ||
| 789 | * know the original bi_idx, so we just free | ||
| 790 | * them all | ||
| 791 | */ | ||
| 792 | __bio_for_each_segment(bvec, mbio, j, 0) | ||
| 793 | bvec->bv_page = behind_pages[j]; | ||
| 794 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | ||
| 795 | atomic_inc(&r1_bio->behind_remaining); | ||
| 796 | } | ||
| 797 | |||
| 701 | atomic_inc(&r1_bio->remaining); | 798 | atomic_inc(&r1_bio->remaining); |
| 702 | 799 | ||
| 703 | bio_list_add(&bl, mbio); | 800 | bio_list_add(&bl, mbio); |
| 704 | } | 801 | } |
| 802 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | ||
| 705 | 803 | ||
| 706 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); | 804 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
| 805 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
| 707 | spin_lock_irqsave(&conf->device_lock, flags); | 806 | spin_lock_irqsave(&conf->device_lock, flags); |
| 708 | bio_list_merge(&conf->pending_bio_list, &bl); | 807 | bio_list_merge(&conf->pending_bio_list, &bl); |
| 709 | bio_list_init(&bl); | 808 | bio_list_init(&bl); |
| @@ -1471,6 +1570,17 @@ out: | |||
| 1471 | static int stop(mddev_t *mddev) | 1570 | static int stop(mddev_t *mddev) |
| 1472 | { | 1571 | { |
| 1473 | conf_t *conf = mddev_to_conf(mddev); | 1572 | conf_t *conf = mddev_to_conf(mddev); |
| 1573 | struct bitmap *bitmap = mddev->bitmap; | ||
| 1574 | int behind_wait = 0; | ||
| 1575 | |||
| 1576 | /* wait for behind writes to complete */ | ||
| 1577 | while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | ||
| 1578 | behind_wait++; | ||
| 1579 | printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); | ||
| 1580 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 1581 | schedule_timeout(HZ); /* wait a second */ | ||
| 1582 | /* need to kick something here to make sure I/O goes? */ | ||
| 1583 | } | ||
| 1474 | 1584 | ||
| 1475 | md_unregister_thread(mddev->thread); | 1585 | md_unregister_thread(mddev->thread); |
| 1476 | mddev->thread = NULL; | 1586 | mddev->thread = NULL; |
