Diffstat (limited to 'drivers/md/raid5.c')
 -rw-r--r--  drivers/md/raid5.c | 252
 1 file changed, 178 insertions(+), 74 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f351422938e0..d26767246d26 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	return sh;
 }
 
+/* Determine if 'data_offset' or 'new_data_offset' should be used
+ * in this stripe_head.
+ */
+static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
+{
+	sector_t progress = conf->reshape_progress;
+	/* Need a memory barrier to make sure we see the value
+	 * of conf->generation, or ->data_offset that was set before
+	 * reshape_progress was updated.
+	 */
+	smp_rmb();
+	if (progress == MaxSector)
+		return 0;
+	if (sh->generation == conf->generation - 1)
+		return 0;
+	/* We are in a reshape, and this is a new-generation stripe,
+	 * so use new_data_offset.
+	 */
+	return 1;
+}
+
 static void
 raid5_end_read_request(struct bio *bi, int error);
 static void
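
The smp_rmb() here pairs with writer-side barriers added later in this patch (smp_mb() in raid5_start_reshape(), smp_wmb() in end_reshape()). The pattern is easier to see in isolation; below is a minimal user-space sketch of the same publish/consume ordering, with C11 fences standing in for the kernel barriers. All names are hypothetical stand-ins, not kernel API:

    #include <stdatomic.h>
    #include <stdint.h>

    #define MAX_SECTOR UINT64_MAX            /* stands in for MaxSector */

    static uint64_t generation;              /* plain data, published below */
    static uint64_t new_data_offset;
    static _Atomic uint64_t reshape_progress = MAX_SECTOR;

    /* Writer (cf. raid5_start_reshape): update generation and offsets
     * first, fence, then let reshape_progress become visible. */
    static void publish_reshape(uint64_t offset)
    {
        generation++;
        new_data_offset = offset;
        atomic_thread_fence(memory_order_release);   /* ~ smp_mb() */
        atomic_store_explicit(&reshape_progress, 0, memory_order_relaxed);
    }

    /* Reader (cf. use_new_offset): sample progress, fence, and only
     * then trust generation/new_data_offset. */
    static int stripe_uses_new_offset(uint64_t stripe_generation)
    {
        uint64_t progress =
            atomic_load_explicit(&reshape_progress, memory_order_relaxed);
        atomic_thread_fence(memory_order_acquire);   /* ~ smp_rmb() */
        if (progress == MAX_SECTOR)
            return 0;                /* no reshape in flight */
        if (stripe_generation == generation - 1)
            return 0;                /* old-generation stripe */
        return 1;
    }

    int main(void)
    {
        publish_reshape(4096);
        /* a current-generation stripe now uses the new offset */
        return !stripe_uses_new_offset(generation);
    }
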
@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			replace_only = 1;
 		} else
 			continue;
+		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
+			rw |= REQ_SYNC;
 
 		bi = &sh->dev[i].req;
 		rbi = &sh->dev[i].rreq; /* For writing to replacement */
@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				bi->bi_rw, i);
 			atomic_inc(&sh->count);
-			bi->bi_sector = sh->sector + rdev->data_offset;
+			if (use_new_offset(conf, sh))
+				bi->bi_sector = (sh->sector
+						 + rdev->new_data_offset);
+			else
+				bi->bi_sector = (sh->sector
+						 + rdev->data_offset);
 			bi->bi_flags = 1 << BIO_UPTODATE;
 			bi->bi_idx = 0;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				rbi->bi_rw, i);
 			atomic_inc(&sh->count);
-			rbi->bi_sector = sh->sector + rrdev->data_offset;
+			if (use_new_offset(conf, sh))
+				rbi->bi_sector = (sh->sector
+						  + rrdev->new_data_offset);
+			else
+				rbi->bi_sector = (sh->sector
+						  + rrdev->data_offset);
 			rbi->bi_flags = 1 << BIO_UPTODATE;
 			rbi->bi_idx = 0;
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
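
Concretely, the only per-generation difference is which offset is added before the bio is issued. A toy, self-contained version of that mapping (hypothetical helper, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Map a stripe's array-relative sector to an absolute device sector. */
    static uint64_t dev_sector(uint64_t stripe_sector, uint64_t data_offset,
                               uint64_t new_data_offset, int use_new)
    {
        return stripe_sector + (use_new ? new_data_offset : data_offset);
    }

    int main(void)
    {
        /* A device whose data area is moving from sector 2048 to 4096:
         * the same stripe sector 1024 lands at 3072 in the old layout
         * and at 5120 once its stripe has been reshaped. */
        assert(dev_sector(1024, 2048, 4096, 0) == 3072);
        assert(dev_sector(1024, 2048, 4096, 1) == 5120);
        return 0;
    }
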
@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			    dev->sector + STRIPE_SECTORS) {
 				if (wbi->bi_rw & REQ_FUA)
 					set_bit(R5_WantFUA, &dev->flags);
+				if (wbi->bi_rw & REQ_SYNC)
+					set_bit(R5_SyncIO, &dev->flags);
 				tx = async_copy_data(1, wbi, dev->page,
 					dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	int i;
-	bool fua = false;
+	bool fua = false, sync = false;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	for (i = disks; i--; )
+	for (i = disks; i--; ) {
 		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+	}
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 			set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
+			if (sync)
+				set_bit(R5_SyncIO, &dev->flags);
 		}
 	}
 
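
Together with the REQ_SYNC hunk in ops_run_io() above, these changes thread the sync hint through the stripe cache: latched per device as R5_SyncIO when a sync write is drained in, OR-ed across devices when reconstruction completes so the parity writes inherit it, then turned back into REQ_SYNC at issue time. A simplified standalone sketch of that round trip (plain flag words instead of the kernel's atomic bitops, and it marks every device rather than only written and parity ones):

    #include <assert.h>
    #include <stdbool.h>

    #define REQ_SYNC_F  (1u << 0)   /* mimics REQ_SYNC on a bio */
    #define R5_SYNCIO_F (1u << 1)   /* mimics R5_SyncIO on an r5dev */

    struct dev { unsigned flags; };

    /* ops_run_biodrain: remember this device received a sync write */
    static void drain(struct dev *d, unsigned bio_rw)
    {
        if (bio_rw & REQ_SYNC_F)
            d->flags |= R5_SYNCIO_F;
    }

    /* ops_complete_reconstruct: if any data block was sync, mark them
     * all so the physical writes (parity included) inherit the hint */
    static void reconstruct_done(struct dev *devs, int n)
    {
        bool sync = false;
        for (int i = 0; i < n; i++)
            sync |= devs[i].flags & R5_SYNCIO_F;
        if (sync)
            for (int i = 0; i < n; i++)
                devs[i].flags |= R5_SYNCIO_F;
    }

    /* ops_run_io: test-and-clear the flag into the outgoing request */
    static unsigned issue(struct dev *d, unsigned rw)
    {
        if (d->flags & R5_SYNCIO_F) {
            d->flags &= ~R5_SYNCIO_F;
            rw |= REQ_SYNC_F;
        }
        return rw;
    }

    int main(void)
    {
        struct dev devs[3] = { {0}, {0}, {0} };  /* 2 data + 1 parity */
        drain(&devs[0], REQ_SYNC_F);             /* one sync write arrives */
        reconstruct_done(devs, 3);
        assert(issue(&devs[2], 0) & REQ_SYNC_F); /* parity write is sync */
        return 0;
    }
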
@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev = NULL;
-
+	sector_t s;
 
 	for (i=0 ; i<disks; i++)
 		if (bi == &sh->dev[i].req)
@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	if (!rdev)
 		rdev = conf->disks[i].rdev;
 
+	if (use_new_offset(conf, sh))
+		s = sh->sector + rdev->new_data_offset;
+	else
+		s = sh->sector + rdev->data_offset;
 	if (uptodate) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error corrected"
 				" (%lu sectors at %llu on %s)\n",
 				mdname(conf->mddev), STRIPE_SECTORS,
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdevname(rdev->bdev, b));
 			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error on replacement device "
 				"(sector %llu on %s).\n",
 				mdname(conf->mddev),
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdn);
 		else if (conf->mddev->degraded >= conf->max_degraded)
 			printk_ratelimited(
@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error not correctable "
 				"(sector %llu on %s).\n",
 				mdname(conf->mddev),
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdn);
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				"md/raid:%s: read error NOT corrected!! "
 				"(sector %llu on %s).\n",
 				mdname(conf->mddev),
-				(unsigned long long)(sh->sector
-						     + rdev->data_offset),
+				(unsigned long long)s,
 				bdn);
 		else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
@@ -3561,7 +3600,7 @@ finish:
 		if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
 			rdev = conf->disks[i].rdev;
 			rdev_clear_badblocks(rdev, sh->sector,
-					     STRIPE_SECTORS);
+					     STRIPE_SECTORS, 0);
 			rdev_dec_pending(rdev, conf->mddev);
 		}
 		if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -3570,7 +3609,7 @@ finish:
 			/* rdev has been moved down */
 			rdev = conf->disks[i].rdev;
 			rdev_clear_badblocks(rdev, sh->sector,
-					     STRIPE_SECTORS);
+					     STRIPE_SECTORS, 0);
 			rdev_dec_pending(rdev, conf->mddev);
 		}
 	}
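
The new trailing 0 tracks a widened signature from the companion md core changes in this series: rdev_clear_badblocks() now takes a flag saying which data offset the sector is relative to, since during a reshape one device sector has two addresses. Sketched here as an assumption, with hypothetical names:

    #include <stdint.h>

    struct rdev_sketch {
        uint64_t data_offset;       /* where old-layout data lives */
        uint64_t new_data_offset;   /* where reshaped data will live */
    };

    /* is_new == 0: 's' is relative to data_offset (the calls above);
     * is_new != 0: 's' is relative to new_data_offset. */
    static uint64_t absolute_sector(const struct rdev_sketch *rdev,
                                    uint64_t s, int is_new)
    {
        return s + (is_new ? rdev->new_data_offset : rdev->data_offset);
    }
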
@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		raid_bio->bi_next = (void*)rdev;
 		align_bi->bi_bdev = rdev->bdev;
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+		/* No reshape active, so we can trust rdev->data_offset */
 		align_bi->bi_sector += rdev->data_offset;
 
 		if (!bio_fits_rdev(align_bi) ||
@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	plugged = mddev_check_plugged(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
-		int disks, data_disks;
 		int previous;
 
 	retry:
 		previous = 0;
-		disks = conf->raid_disks;
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		if (unlikely(conf->reshape_progress != MaxSector)) {
 			/* spinlock is needed as reshape_progress may be
@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			 * to check again.
 			 */
 			spin_lock_irq(&conf->device_lock);
-			if (mddev->delta_disks < 0
+			if (mddev->reshape_backwards
 			    ? logical_sector < conf->reshape_progress
 			    : logical_sector >= conf->reshape_progress) {
-				disks = conf->previous_raid_disks;
 				previous = 1;
 			} else {
-				if (mddev->delta_disks < 0
+				if (mddev->reshape_backwards
 				    ? logical_sector < conf->reshape_safe
 				    : logical_sector >= conf->reshape_safe) {
 					spin_unlock_irq(&conf->device_lock);
@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			}
 			spin_unlock_irq(&conf->device_lock);
 		}
-		data_disks = disks - conf->max_degraded;
 
 		new_sector = raid5_compute_sector(conf, logical_sector,
 						  previous,
@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			 */
 			int must_retry = 0;
 			spin_lock_irq(&conf->device_lock);
-			if (mddev->delta_disks < 0
+			if (mddev->reshape_backwards
 			    ? logical_sector >= conf->reshape_progress
 			    : logical_sector < conf->reshape_progress)
 				/* mismatch, need to try again */
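
All three make_request() tests above are the same question with the direction flipped: a forward reshape has already converted sectors in [0, reshape_progress), a backward one has converted [reshape_progress, end). As a standalone predicate (hypothetical, simplified):

    #include <stdbool.h>
    #include <stdint.h>

    /* True if 'sector' has not been reshaped yet and must still be
     * addressed through the previous (old) layout. */
    static bool sector_in_old_layout(uint64_t sector, uint64_t progress,
                                     bool reshape_backwards)
    {
        return reshape_backwards ? sector < progress
                                 : sector >= progress;
    }
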
@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
-		if (mddev->delta_disks < 0 &&
+		if (mddev->reshape_backwards &&
 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
 			sector_nr = raid5_size(mddev, 0, 0)
 				- conf->reshape_progress;
-		} else if (mddev->delta_disks >= 0 &&
+		} else if (!mddev->reshape_backwards &&
 			   conf->reshape_progress > 0)
 			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 	else
 		reshape_sectors = mddev->chunk_sectors;
 
-	/* we update the metadata when there is more than 3Meg
-	 * in the block range (that is rather arbitrary, should
-	 * probably be time based) or when the data about to be
-	 * copied would over-write the source of the data at
-	 * the front of the range.
-	 * i.e. one new_stripe along from reshape_progress new_maps
-	 * to after where reshape_safe old_maps to
+	/* We update the metadata at least every 10 seconds, or when
+	 * the data about to be copied would over-write the source of
+	 * the data at the front of the range.  i.e. one new_stripe
+	 * along from reshape_progress new_maps to after where
+	 * reshape_safe old_maps to
 	 */
 	writepos = conf->reshape_progress;
 	sector_div(writepos, new_data_disks);
@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 	sector_div(readpos, data_disks);
 	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
-	if (mddev->delta_disks < 0) {
+	if (mddev->reshape_backwards) {
 		writepos -= min_t(sector_t, reshape_sectors, writepos);
 		readpos += reshape_sectors;
 		safepos += reshape_sectors;
@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 		safepos -= min_t(sector_t, reshape_sectors, safepos);
 	}
 
+	/* Having calculated the 'writepos' possibly use it
+	 * to set 'stripe_addr' which is where we will write to.
+	 */
+	if (mddev->reshape_backwards) {
+		BUG_ON(conf->reshape_progress == 0);
+		stripe_addr = writepos;
+		BUG_ON((mddev->dev_sectors &
+			~((sector_t)reshape_sectors - 1))
+		       - reshape_sectors - stripe_addr
+		       != sector_nr);
+	} else {
+		BUG_ON(writepos != sector_nr + reshape_sectors);
+		stripe_addr = sector_nr;
+	}
+
 	/* 'writepos' is the most advanced device address we might write.
 	 * 'readpos' is the least advanced device address we might read.
 	 * 'safepos' is the least address recorded in the metadata as having
 	 * been reshaped.
-	 * If 'readpos' is behind 'writepos', then there is no way that we can
+	 * If there is a min_offset_diff, these are adjusted either by
+	 * increasing the safepos/readpos if diff is negative, or
+	 * increasing writepos if diff is positive.
+	 * If 'readpos' is then behind 'writepos', there is no way that we can
 	 * ensure safety in the face of a crash - that must be done by userspace
 	 * making a backup of the data.  So in that case there is no particular
 	 * rush to update metadata.
@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 	 * Maybe that number should be configurable, but I'm not sure it is
 	 * worth it.... maybe it could be a multiple of safemode_delay???
 	 */
-	if ((mddev->delta_disks < 0
+	if (conf->min_offset_diff < 0) {
+		safepos += -conf->min_offset_diff;
+		readpos += -conf->min_offset_diff;
+	} else
+		writepos += conf->min_offset_diff;
+
+	if ((mddev->reshape_backwards
 	     ? (safepos > writepos && readpos < writepos)
 	     : (safepos < writepos && readpos > writepos)) ||
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
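
Numerically, the new block does exactly what the revised comment says: min_offset_diff (computed in run(), later in this diff) is the per-device offset delta, and a negative value pads readpos/safepos while a positive one pads writepos before the safety comparison. A worked instance with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        long long min_offset_diff = -16; /* data areas move 16 sectors down */
        unsigned long long writepos = 1024, readpos = 960, safepos = 952;

        if (min_offset_diff < 0) {
            safepos += -min_offset_diff;   /* 968 */
            readpos += -min_offset_diff;   /* 976 */
        } else
            writepos += min_offset_diff;

        printf("writepos=%llu readpos=%llu safepos=%llu\n",
               writepos, readpos, safepos);
        return 0;
    }
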
@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 
-	if (mddev->delta_disks < 0) {
-		BUG_ON(conf->reshape_progress == 0);
-		stripe_addr = writepos;
-		BUG_ON((mddev->dev_sectors &
-			~((sector_t)reshape_sectors - 1))
-		       - reshape_sectors - stripe_addr
-		       != sector_nr);
-	} else {
-		BUG_ON(writepos != sector_nr + reshape_sectors);
-		stripe_addr = sector_nr;
-	}
 	INIT_LIST_HEAD(&stripes);
 	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 		int j;
@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 		list_add(&sh->lru, &stripes);
 	}
 	spin_lock_irq(&conf->device_lock);
-	if (mddev->delta_disks < 0)
+	if (mddev->reshape_backwards)
 		conf->reshape_progress -= reshape_sectors * new_data_disks;
 	else
 		conf->reshape_progress += reshape_sectors * new_data_disks;
@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	sector_t reshape_offset = 0;
 	int i;
+	long long min_offset_diff = 0;
+	int first = 1;
 
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "md/raid:%s: not clean"
 		       " -- starting background reconstruction\n",
 		       mdname(mddev));
+
+	rdev_for_each(rdev, mddev) {
+		long long diff;
+		if (rdev->raid_disk < 0)
+			continue;
+		diff = (rdev->new_data_offset - rdev->data_offset);
+		if (first) {
+			min_offset_diff = diff;
+			first = 0;
+		} else if (mddev->reshape_backwards &&
+			   diff < min_offset_diff)
+			min_offset_diff = diff;
+		else if (!mddev->reshape_backwards &&
+			 diff > min_offset_diff)
+			min_offset_diff = diff;
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
-		 * Currently only disks can change, it must
-		 * increase, and we must be past the point where
-		 * a stripe over-writes itself
+		 * Difficulties arise if the stripe we would write to
+		 * next is at or after the stripe we would read from next.
+		 * For a reshape that changes the number of devices, this
+		 * is only possible for a very short time, and mdadm makes
+		 * sure that time appears to have passed before assembling
+		 * the array.  So we fail if that time hasn't passed.
+		 * For a reshape that keeps the number of devices the same,
+		 * mdadm must be monitoring the reshape and keeping the
+		 * critical areas read-only and backed up.  It will start
+		 * the array in read-only mode, so we check for that.
 		 */
 		sector_t here_new, here_old;
 		int old_disks;
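
Note that, despite the variable's name, the scan keeps the smallest delta only when reshaping backwards; when reshaping forwards it keeps the largest. The same loop lifted out as a standalone function over a hypothetical member array:

    #include <stdbool.h>

    struct member {
        int raid_disk;              /* < 0 means spare, skipped */
        long long data_offset;
        long long new_data_offset;
    };

    static long long min_offset_diff(const struct member *m, int n,
                                     bool reshape_backwards)
    {
        long long min_diff = 0;
        bool first = true;

        for (int i = 0; i < n; i++) {
            long long diff;
            if (m[i].raid_disk < 0)
                continue;
            diff = m[i].new_data_offset - m[i].data_offset;
            if (first) {
                min_diff = diff;
                first = false;
            } else if (reshape_backwards ? diff < min_diff
                                         : diff > min_diff)
                min_diff = diff;
        }
        return min_diff;
    }
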
@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev)
 		/* here_old is the first stripe that we might need to read
 		 * from */
 		if (mddev->delta_disks == 0) {
+			if ((here_new * mddev->new_chunk_sectors !=
+			     here_old * mddev->chunk_sectors)) {
+				printk(KERN_ERR "md/raid:%s: reshape position is"
+				       " confused - aborting\n", mdname(mddev));
+				return -EINVAL;
+			}
 			/* We cannot be sure it is safe to start an in-place
-			 * reshape.  It is only safe if user-space if monitoring
+			 * reshape.  It is only safe if user-space is monitoring
 			 * and taking constant backups.
 			 * mdadm always starts a situation like this in
 			 * readonly mode so it can take control before
 			 * allowing any writes.  So just check for that.
 			 */
-			if ((here_new * mddev->new_chunk_sectors !=
-			     here_old * mddev->chunk_sectors) ||
-			    mddev->ro == 0) {
-				printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
-				       " in read-only mode - aborting\n",
+			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
+			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
+				/* not really in-place - so OK */;
+			else if (mddev->ro == 0) {
+				printk(KERN_ERR "md/raid:%s: in-place reshape "
+				       "must be started in read-only mode "
+				       "- aborting\n",
 				       mdname(mddev));
 				return -EINVAL;
 			}
-		} else if (mddev->delta_disks < 0
-			   ? (here_new * mddev->new_chunk_sectors <=
+		} else if (mddev->reshape_backwards
+			   ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
 			      here_old * mddev->chunk_sectors)
 			   : (here_new * mddev->new_chunk_sectors >=
-			      here_old * mddev->chunk_sectors)) {
+			      here_old * mddev->chunk_sectors + (-min_offset_diff))) {
 			/* Reading from the same stripe as writing to - bad */
 			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
 			       "auto-recovery - aborting.\n",
@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
+	conf->min_offset_diff = min_offset_diff;
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	mddev->private = conf;
@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev)
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
 
-		rdev_for_each(rdev, mddev)
+		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->new_data_offset << 9);
+		}
 	}
 
 	return 0;
@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
+	sector_t newsize;
 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
-	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
-					       mddev->raid_disks));
-	if (mddev->array_sectors >
-	    raid5_size(mddev, sectors, mddev->raid_disks))
+	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
+	if (mddev->external_size &&
+	    mddev->array_sectors > newsize)
 		return -EINVAL;
+	if (mddev->bitmap) {
+		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
+		if (ret)
+			return ret;
+	}
+	md_set_array_sectors(mddev, newsize);
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
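
The rewritten path also fixes the ordering: the new size is computed once, the external-size check runs before anything is mutated, and the bitmap is grown before the array size is committed, so a bitmap_resize() failure leaves the array unchanged. The shape of that sequencing, with stubbed helpers (all hypothetical):

    #include <errno.h>

    typedef unsigned long long sector_t;

    /* Trivial stubs so the sketch is self-contained. */
    static sector_t compute_size(sector_t sectors, int disks)
    { return sectors * (disks - 1); }        /* raid5: one parity device */
    static int grow_bitmap(sector_t sectors) { (void)sectors; return 0; }
    static sector_t array_sectors;

    static int resize(sector_t sectors, int disks,
                      int external_size, int have_bitmap)
    {
        sector_t newsize = compute_size(sectors, disks);

        /* An externally managed size may not exceed the new capacity. */
        if (external_size && array_sectors > newsize)
            return -EINVAL;
        /* Grow the bitmap first: on failure nothing has changed. */
        if (have_bitmap) {
            int ret = grow_bitmap(sectors);
            if (ret)
                return ret;
        }
        array_sectors = newsize;        /* only now commit the new size */
        return 0;
    }
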
@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev)
 	    mddev->new_layout == mddev->layout &&
 	    mddev->new_chunk_sectors == mddev->chunk_sectors)
 		return 0; /* nothing to do */
-	if (mddev->bitmap)
-		/* Cannot grow a bitmap yet */
-		return -EBUSY;
 	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
@@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev)
 	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
 
-	rdev_for_each(rdev, mddev)
+	if (has_failed(conf))
+		return -EINVAL;
+
+	rdev_for_each(rdev, mddev) {
 		if (!test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags))
 			spares++;
+	}
 
 	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
 		/* Not enough devices even to make a degraded array
@@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev)
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->prev_algo = conf->algorithm;
 	conf->algorithm = mddev->new_layout;
-	if (mddev->delta_disks < 0)
+	conf->generation++;
+	/* Code that selects data_offset needs to see the generation update
+	 * if reshape_progress has been set - so a memory barrier needed.
+	 */
+	smp_mb();
+	if (mddev->reshape_backwards)
 		conf->reshape_progress = raid5_size(mddev, 0, 0);
 	else
 		conf->reshape_progress = 0;
 	conf->reshape_safe = conf->reshape_progress;
-	conf->generation++;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* Add some new drives, as many as will fit.
@@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev)
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		rdev_for_each(rdev, mddev)
+			rdev->new_data_offset = rdev->data_offset;
+		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		mddev->reshape_position = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
@@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf)
 {
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		struct md_rdev *rdev;
 
 		spin_lock_irq(&conf->device_lock);
 		conf->previous_raid_disks = conf->raid_disks;
+		rdev_for_each(rdev, conf->mddev)
+			rdev->data_offset = rdev->new_data_offset;
+		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
@@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev)
 			     d < conf->raid_disks - mddev->delta_disks;
 			     d++) {
 				struct md_rdev *rdev = conf->disks[d].rdev;
-				if (rdev &&
-				    raid5_remove_disk(mddev, rdev) == 0) {
-					sysfs_unlink_rdev(mddev, rdev);
-					rdev->raid_disk = -1;
-				}
+				if (rdev)
+					clear_bit(In_sync, &rdev->flags);
+				rdev = conf->disks[d].replacement;
+				if (rdev)
+					clear_bit(In_sync, &rdev->flags);
 			}
 		}
 		mddev->layout = conf->algorithm;
 		mddev->chunk_sectors = conf->chunk_sectors;
 		mddev->reshape_position = MaxSector;
 		mddev->delta_disks = 0;
+		mddev->reshape_backwards = 0;
 	}
 }
 
