Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c | 252
1 file changed, 178 insertions(+), 74 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f351422938e0..d26767246d26 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	return sh;
 }
 
+/* Determine if 'data_offset' or 'new_data_offset' should be used
+ * in this stripe_head.
+ */
+static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
+{
+	sector_t progress = conf->reshape_progress;
+	/* Need a memory barrier to make sure we see the value
+	 * of conf->generation, or ->data_offset that was set before
+	 * reshape_progress was updated.
+	 */
+	smp_rmb();
+	if (progress == MaxSector)
+		return 0;
+	if (sh->generation == conf->generation - 1)
+		return 0;
+	/* We are in a reshape, and this is a new-generation stripe,
+	 * so use new_data_offset.
+	 */
+	return 1;
+}
+
 static void
 raid5_end_read_request(struct bio *bi, int error);
 static void
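Annotation: use_new_offset() is the pivot of this patch -- once a reshape can
relocate data to a different offset on each device, every I/O must ask which
generation of offsets a stripe belongs to.  A minimal userspace model of just
the decision rule (hypothetical names model_conf/model_stripe; the smp_rmb()
is dropped because this sketch is single-threaded):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_SECTOR UINT64_MAX            /* stand-in for MaxSector */

    struct model_conf {
        uint64_t reshape_progress;           /* MAX_SECTOR: no reshape */
        int generation;                      /* bumped once per reshape */
    };

    struct model_stripe {
        int generation;                      /* conf->generation at alloc */
    };

    /* Return 1 when the stripe must use new_data_offset. */
    static int model_use_new_offset(const struct model_conf *conf,
                                    const struct model_stripe *sh)
    {
        if (conf->reshape_progress == MAX_SECTOR)
            return 0;                        /* no reshape running */
        if (sh->generation == conf->generation - 1)
            return 0;                        /* stripe predates this reshape */
        return 1;                            /* new-generation stripe */
    }

    int main(void)
    {
        struct model_conf conf = { .reshape_progress = 1024, .generation = 2 };
        struct model_stripe before = { .generation = 1 };
        struct model_stripe after = { .generation = 2 };

        /* prints "0 1": old-generation stripes keep the old offset */
        printf("%d %d\n", model_use_new_offset(&conf, &before),
               model_use_new_offset(&conf, &after));
        return 0;
    }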
@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			replace_only = 1;
 		} else
 			continue;
+		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
+			rw |= REQ_SYNC;
 
 		bi = &sh->dev[i].req;
 		rbi = &sh->dev[i].rreq; /* For writing to replacement */
@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			__func__, (unsigned long long)sh->sector,
 			bi->bi_rw, i);
 		atomic_inc(&sh->count);
-		bi->bi_sector = sh->sector + rdev->data_offset;
+		if (use_new_offset(conf, sh))
+			bi->bi_sector = (sh->sector
+					 + rdev->new_data_offset);
+		else
+			bi->bi_sector = (sh->sector
+					 + rdev->data_offset);
 		bi->bi_flags = 1 << BIO_UPTODATE;
 		bi->bi_idx = 0;
 		bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			__func__, (unsigned long long)sh->sector,
 			rbi->bi_rw, i);
 		atomic_inc(&sh->count);
-		rbi->bi_sector = sh->sector + rrdev->data_offset;
+		if (use_new_offset(conf, sh))
+			rbi->bi_sector = (sh->sector
+					  + rrdev->new_data_offset);
+		else
+			rbi->bi_sector = (sh->sector
+					  + rrdev->data_offset);
 		rbi->bi_flags = 1 << BIO_UPTODATE;
 		rbi->bi_idx = 0;
 		rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
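Annotation: both hunks implement the same mapping -- the device LBA for a
stripe is its array-relative sector plus a per-device data offset, with the
offset chosen by the stripe's generation.  A sketch, all names hypothetical:

    #include <stdint.h>

    struct model_rdev {
        uint64_t data_offset;        /* data start, old layout */
        uint64_t new_data_offset;    /* data start, new layout */
    };

    /* Mirrors the bi_sector computation in ops_run_io() above. */
    static uint64_t map_stripe_sector(uint64_t stripe_sector,
                                      const struct model_rdev *rdev,
                                      int use_new)
    {
        return stripe_sector + (use_new ? rdev->new_data_offset
                                        : rdev->data_offset);
    }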
@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			 dev->sector + STRIPE_SECTORS) {
 			if (wbi->bi_rw & REQ_FUA)
 				set_bit(R5_WantFUA, &dev->flags);
+			if (wbi->bi_rw & REQ_SYNC)
+				set_bit(R5_SyncIO, &dev->flags);
 			tx = async_copy_data(1, wbi, dev->page,
 				dev->sector, tx);
 			wbi = r5_next_bio(wbi, dev->sector);
@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	int i;
-	bool fua = false;
+	bool fua = false, sync = false;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	for (i = disks; i--; )
+	for (i = disks; i--; ) {
 		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+	}
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 			set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
+			if (sync)
+				set_bit(R5_SyncIO, &dev->flags);
 		}
 	}
 
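Annotation: REQ_SYNC is handled exactly like REQ_FUA -- latched per r5dev in
ops_run_biodrain(), gathered across all devices after parity reconstruction,
then re-applied so the parity writes inherit the flag too.  A simplified
model of that gather-then-broadcast (hypothetical names; the kernel only
re-applies flags to blocks it is about to write):

    #include <stdbool.h>

    #define MODEL_WANT_FUA (1u << 0)
    #define MODEL_SYNC_IO  (1u << 1)

    static void model_propagate(unsigned int *dev_flags, int disks)
    {
        bool fua = false, sync = false;
        int i;

        for (i = 0; i < disks; i++) {        /* gather */
            fua  = fua  || (dev_flags[i] & MODEL_WANT_FUA);
            sync = sync || (dev_flags[i] & MODEL_SYNC_IO);
        }
        for (i = 0; i < disks; i++) {        /* broadcast */
            if (fua)
                dev_flags[i] |= MODEL_WANT_FUA;
            if (sync)
                dev_flags[i] |= MODEL_SYNC_IO;
        }
    }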
@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev = NULL;
-
+	sector_t s;
 
 	for (i=0 ; i<disks; i++)
 		if (bi == &sh->dev[i].req)
@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		if (!rdev)
 			rdev = conf->disks[i].rdev;
 
+	if (use_new_offset(conf, sh))
+		s = sh->sector + rdev->new_data_offset;
+	else
+		s = sh->sector + rdev->data_offset;
 	if (uptodate) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1683 "md/raid:%s: read error corrected" 1726 "md/raid:%s: read error corrected"
1684 " (%lu sectors at %llu on %s)\n", 1727 " (%lu sectors at %llu on %s)\n",
1685 mdname(conf->mddev), STRIPE_SECTORS, 1728 mdname(conf->mddev), STRIPE_SECTORS,
1686 (unsigned long long)(sh->sector 1729 (unsigned long long)s,
1687 + rdev->data_offset),
1688 bdevname(rdev->bdev, b)); 1730 bdevname(rdev->bdev, b));
1689 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1731 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1690 clear_bit(R5_ReadError, &sh->dev[i].flags); 1732 clear_bit(R5_ReadError, &sh->dev[i].flags);
@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1704 "md/raid:%s: read error on replacement device " 1746 "md/raid:%s: read error on replacement device "
1705 "(sector %llu on %s).\n", 1747 "(sector %llu on %s).\n",
1706 mdname(conf->mddev), 1748 mdname(conf->mddev),
1707 (unsigned long long)(sh->sector 1749 (unsigned long long)s,
1708 + rdev->data_offset),
1709 bdn); 1750 bdn);
1710 else if (conf->mddev->degraded >= conf->max_degraded) 1751 else if (conf->mddev->degraded >= conf->max_degraded)
1711 printk_ratelimited( 1752 printk_ratelimited(
@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1713 "md/raid:%s: read error not correctable " 1754 "md/raid:%s: read error not correctable "
1714 "(sector %llu on %s).\n", 1755 "(sector %llu on %s).\n",
1715 mdname(conf->mddev), 1756 mdname(conf->mddev),
1716 (unsigned long long)(sh->sector 1757 (unsigned long long)s,
1717 + rdev->data_offset),
1718 bdn); 1758 bdn);
1719 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1759 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1720 /* Oh, no!!! */ 1760 /* Oh, no!!! */
@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1723 "md/raid:%s: read error NOT corrected!! " 1763 "md/raid:%s: read error NOT corrected!! "
1724 "(sector %llu on %s).\n", 1764 "(sector %llu on %s).\n",
1725 mdname(conf->mddev), 1765 mdname(conf->mddev),
1726 (unsigned long long)(sh->sector 1766 (unsigned long long)s,
1727 + rdev->data_offset),
1728 bdn); 1767 bdn);
1729 else if (atomic_read(&rdev->read_errors) 1768 else if (atomic_read(&rdev->read_errors)
1730 > conf->max_nr_stripes) 1769 > conf->max_nr_stripes)
@@ -3561,7 +3600,7 @@ finish:
 		if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
 			rdev = conf->disks[i].rdev;
 			rdev_clear_badblocks(rdev, sh->sector,
-					     STRIPE_SECTORS);
+					     STRIPE_SECTORS, 0);
 			rdev_dec_pending(rdev, conf->mddev);
 		}
 		if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -3570,7 +3609,7 @@ finish:
 				/* rdev have been moved down */
 				rdev = conf->disks[i].rdev;
 			rdev_clear_badblocks(rdev, sh->sector,
-					     STRIPE_SECTORS);
+					     STRIPE_SECTORS, 0);
 			rdev_dec_pending(rdev, conf->mddev);
 		}
 	}
@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		raid_bio->bi_next = (void*)rdev;
 		align_bi->bi_bdev =  rdev->bdev;
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+		/* No reshape active, so we can trust rdev->data_offset */
 		align_bi->bi_sector += rdev->data_offset;
 
 		if (!bio_fits_rdev(align_bi) ||
@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	plugged = mddev_check_plugged(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
-		int disks, data_disks;
 		int previous;
 
 	retry:
 		previous = 0;
-		disks = conf->raid_disks;
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		if (unlikely(conf->reshape_progress != MaxSector)) {
 			/* spinlock is needed as reshape_progress may be
@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			 * to check again.
 			 */
 			spin_lock_irq(&conf->device_lock);
-			if (mddev->delta_disks < 0
+			if (mddev->reshape_backwards
 			    ? logical_sector < conf->reshape_progress
 			    : logical_sector >= conf->reshape_progress) {
-				disks = conf->previous_raid_disks;
 				previous = 1;
 			} else {
-				if (mddev->delta_disks < 0
+				if (mddev->reshape_backwards
 				    ? logical_sector < conf->reshape_safe
 				    : logical_sector >= conf->reshape_safe) {
 					spin_unlock_irq(&conf->device_lock);
@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			}
 			spin_unlock_irq(&conf->device_lock);
 		}
-		data_disks = disks - conf->max_degraded;
 
 		new_sector = raid5_compute_sector(conf, logical_sector,
 						  previous,
@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			 */
 			int must_retry = 0;
 			spin_lock_irq(&conf->device_lock);
-			if (mddev->delta_disks < 0
+			if (mddev->reshape_backwards
 			    ? logical_sector >= conf->reshape_progress
 			    : logical_sector < conf->reshape_progress)
 				/* mismatch, need to try again */
@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
-		if (mddev->delta_disks < 0 &&
+		if (mddev->reshape_backwards &&
 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
 			sector_nr = raid5_size(mddev, 0, 0)
 				- conf->reshape_progress;
-		} else if (mddev->delta_disks >= 0 &&
+		} else if (!mddev->reshape_backwards &&
 			   conf->reshape_progress > 0)
 			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	else
 		reshape_sectors = mddev->chunk_sectors;
 
-	/* we update the metadata when there is more than 3Meg
-	 * in the block range (that is rather arbitrary, should
-	 * probably be time based) or when the data about to be
-	 * copied would over-write the source of the data at
-	 * the front of the range.
-	 * i.e. one new_stripe along from reshape_progress new_maps
-	 * to after where reshape_safe old_maps to
+	/* We update the metadata at least every 10 seconds, or when
+	 * the data about to be copied would over-write the source of
+	 * the data at the front of the range.  i.e. one new_stripe
+	 * along from reshape_progress new_maps to after where
+	 * reshape_safe old_maps to
 	 */
 	writepos = conf->reshape_progress;
 	sector_div(writepos, new_data_disks);
@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	sector_div(readpos, data_disks);
 	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
-	if (mddev->delta_disks < 0) {
+	if (mddev->reshape_backwards) {
 		writepos -= min_t(sector_t, reshape_sectors, writepos);
 		readpos += reshape_sectors;
 		safepos += reshape_sectors;
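Annotation: writepos/readpos/safepos are array addresses converted to
per-device addresses by dividing by the number of data disks (new and old
layout respectively), then biased by one reshape chunk in the direction of
travel.  A rough model, assuming plain 64-bit division in place of
sector_div() (all names hypothetical):

    #include <stdint.h>

    static uint64_t min_u64(uint64_t a, uint64_t b)
    {
        return a < b ? a : b;
    }

    static void model_positions(uint64_t progress, uint64_t safe,
                                unsigned int new_data_disks,
                                unsigned int data_disks,
                                uint64_t reshape_sectors, int backwards,
                                uint64_t *writepos, uint64_t *readpos,
                                uint64_t *safepos)
    {
        *writepos = progress / new_data_disks;   /* where we will write */
        *readpos  = progress / data_disks;       /* where we will read */
        *safepos  = safe / data_disks;           /* checkpointed so far */

        if (backwards) {                         /* reshape walking down */
            *writepos -= min_u64(reshape_sectors, *writepos);
            *readpos  += reshape_sectors;
            *safepos  += reshape_sectors;
        } else {                                 /* reshape walking up */
            *writepos += reshape_sectors;
            *readpos  -= min_u64(reshape_sectors, *readpos);
            *safepos  -= min_u64(reshape_sectors, *safepos);
        }
    }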
@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		safepos -= min_t(sector_t, reshape_sectors, safepos);
 	}
 
+	/* Having calculated the 'writepos' possibly use it
+	 * to set 'stripe_addr' which is where we will write to.
+	 */
+	if (mddev->reshape_backwards) {
+		BUG_ON(conf->reshape_progress == 0);
+		stripe_addr = writepos;
+		BUG_ON((mddev->dev_sectors &
+			~((sector_t)reshape_sectors - 1))
+		       - reshape_sectors - stripe_addr
+		       != sector_nr);
+	} else {
+		BUG_ON(writepos != sector_nr + reshape_sectors);
+		stripe_addr = sector_nr;
+	}
+
 	/* 'writepos' is the most advanced device address we might write.
 	 * 'readpos' is the least advanced device address we might read.
 	 * 'safepos' is the least address recorded in the metadata as having
 	 * been reshaped.
-	 * If 'readpos' is behind 'writepos', then there is no way that we can
+	 * If there is a min_offset_diff, these are adjusted either by
+	 * increasing the safepos/readpos if diff is negative, or
+	 * increasing writepos if diff is positive.
+	 * If 'readpos' is then behind 'writepos', there is no way that we can
 	 * ensure safety in the face of a crash - that must be done by userspace
 	 * making a backup of the data.  So in that case there is no particular
 	 * rush to update metadata.
@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 * Maybe that number should be configurable, but I'm not sure it is
 	 * worth it.... maybe it could be a multiple of safemode_delay???
 	 */
-	if ((mddev->delta_disks < 0
+	if (conf->min_offset_diff < 0) {
+		safepos += -conf->min_offset_diff;
+		readpos += -conf->min_offset_diff;
+	} else
+		writepos += conf->min_offset_diff;
+
+	if ((mddev->reshape_backwards
 	     ? (safepos > writepos && readpos < writepos)
 	     : (safepos < writepos && readpos > writepos)) ||
 	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
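Annotation: the checkpoint trigger now folds in min_offset_diff -- a negative
diff (data moving toward the start of the device) widens the side that must
be treated as unsafe, a positive one pushes writepos forward.  A sketch of
the resulting condition, with elapsed_sec standing in for the jiffies
timeout (hypothetical names):

    #include <stdint.h>

    static int model_need_checkpoint(uint64_t writepos, uint64_t readpos,
                                     uint64_t safepos,
                                     long long min_offset_diff,
                                     int backwards, unsigned int elapsed_sec)
    {
        if (min_offset_diff < 0) {
            safepos += (uint64_t)(-min_offset_diff);
            readpos += (uint64_t)(-min_offset_diff);
        } else {
            writepos += (uint64_t)min_offset_diff;
        }

        if (backwards ? (safepos > writepos && readpos < writepos)
                      : (safepos < writepos && readpos > writepos))
            return 1;                     /* about to enter unsafe window */
        return elapsed_sec >= 10;         /* periodic metadata update */
    }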
@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 
-	if (mddev->delta_disks < 0) {
-		BUG_ON(conf->reshape_progress == 0);
-		stripe_addr = writepos;
-		BUG_ON((mddev->dev_sectors &
-			~((sector_t)reshape_sectors - 1))
-		       - reshape_sectors - stripe_addr
-		       != sector_nr);
-	} else {
-		BUG_ON(writepos != sector_nr + reshape_sectors);
-		stripe_addr = sector_nr;
-	}
 	INIT_LIST_HEAD(&stripes);
 	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 		int j;
@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		list_add(&sh->lru, &stripes);
 	}
 	spin_lock_irq(&conf->device_lock);
-	if (mddev->delta_disks < 0)
+	if (mddev->reshape_backwards)
 		conf->reshape_progress -= reshape_sectors * new_data_disks;
 	else
 		conf->reshape_progress += reshape_sectors * new_data_disks;
@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	sector_t reshape_offset = 0;
 	int i;
+	long long min_offset_diff = 0;
+	int first = 1;
 
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "md/raid:%s: not clean"
 		       " -- starting background reconstruction\n",
 		       mdname(mddev));
+
+	rdev_for_each(rdev, mddev) {
+		long long diff;
+		if (rdev->raid_disk < 0)
+			continue;
+		diff = (rdev->new_data_offset - rdev->data_offset);
+		if (first) {
+			min_offset_diff = diff;
+			first = 0;
+		} else if (mddev->reshape_backwards &&
+			   diff < min_offset_diff)
+			min_offset_diff = diff;
+		else if (!mddev->reshape_backwards &&
+			 diff > min_offset_diff)
+			min_offset_diff = diff;
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
-		 * Currently only disks can change, it must
-		 * increase, and we must be past the point where
-		 * a stripe over-writes itself
+		 * Difficulties arise if the stripe we would write to
+		 * next is at or after the stripe we would read from next.
+		 * For a reshape that changes the number of devices, this
+		 * is only possible for a very short time, and mdadm makes
+		 * sure that time appears to have passed before assembling
+		 * the array.  So we fail if that time hasn't passed.
+		 * For a reshape that keeps the number of devices the same
+		 * mdadm must be monitoring the reshape and keeping the
+		 * critical areas read-only and backed up.  It will start
+		 * the array in read-only mode, so we check for that.
 		 */
 		sector_t here_new, here_old;
 		int old_disks;
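Annotation: min_offset_diff is the offset delta that can safely be assumed
for every active member: the minimum of new_data_offset - data_offset for a
backwards reshape (where the diff is negative), the maximum for a forwards
one.  A userspace model of the loop (structure and names illustrative only):

    #include <stdint.h>

    struct model_rdev {
        int raid_disk;                    /* < 0: spare, not in array */
        uint64_t data_offset;
        uint64_t new_data_offset;
    };

    static long long model_min_offset_diff(const struct model_rdev *rdevs,
                                           int nr, int backwards)
    {
        long long min_diff = 0;
        int first = 1, i;

        for (i = 0; i < nr; i++) {
            long long diff;

            if (rdevs[i].raid_disk < 0)
                continue;
            diff = (long long)rdevs[i].new_data_offset -
                   (long long)rdevs[i].data_offset;
            if (first) {
                min_diff = diff;
                first = 0;
            } else if (backwards ? diff < min_diff : diff > min_diff) {
                min_diff = diff;
            }
        }
        return min_diff;
    }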
@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev)
 		/* here_old is the first stripe that we might need to read
 		 * from */
 		if (mddev->delta_disks == 0) {
+			if ((here_new * mddev->new_chunk_sectors !=
+			     here_old * mddev->chunk_sectors)) {
+				printk(KERN_ERR "md/raid:%s: reshape position is"
+				       " confused - aborting\n", mdname(mddev));
+				return -EINVAL;
+			}
 			/* We cannot be sure it is safe to start an in-place
-			 * reshape.  It is only safe if user-space if monitoring
+			 * reshape.  It is only safe if user-space is monitoring
 			 * and taking constant backups.
 			 * mdadm always starts a situation like this in
 			 * readonly mode so it can take control before
 			 * allowing any writes.  So just check for that.
 			 */
-			if ((here_new * mddev->new_chunk_sectors !=
-			     here_old * mddev->chunk_sectors) ||
-			    mddev->ro == 0) {
-				printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
-				       " in read-only mode - aborting\n",
+			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
+			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
+				/* not really in-place - so OK */;
+			else if (mddev->ro == 0) {
+				printk(KERN_ERR "md/raid:%s: in-place reshape "
+				       "must be started in read-only mode "
+				       "- aborting\n",
 				       mdname(mddev));
 				return -EINVAL;
 			}
-		} else if (mddev->delta_disks < 0
-			? (here_new * mddev->new_chunk_sectors <=
+		} else if (mddev->reshape_backwards
+			? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
 			   here_old * mddev->chunk_sectors)
 			: (here_new * mddev->new_chunk_sectors >=
-			   here_old * mddev->chunk_sectors)) {
+			   here_old * mddev->chunk_sectors + (-min_offset_diff))) {
 			/* Reading from the same stripe as writing to - bad */
 			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
 			       "auto-recovery - aborting.\n",
@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
+	conf->min_offset_diff = min_offset_diff;
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	mddev->private = conf;
@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev)
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
 
-		rdev_for_each(rdev, mddev)
+		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->new_data_offset << 9);
+		}
 	}
 
 	return 0;
@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
+	sector_t newsize;
 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
-	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
-					       mddev->raid_disks));
-	if (mddev->array_sectors >
-	    raid5_size(mddev, sectors, mddev->raid_disks))
+	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
+	if (mddev->external_size &&
+	    mddev->array_sectors > newsize)
 		return -EINVAL;
+	if (mddev->bitmap) {
+		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
+		if (ret)
+			return ret;
+	}
+	md_set_array_sectors(mddev, newsize);
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
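Annotation: raid5_resize() still rounds the requested size down to a whole
number of chunks first; the mask trick works because chunk_sectors is always
a power of two.  The reordering then resizes the bitmap before committing
the new array size, so a bitmap_resize() failure leaves the array untouched.
The rounding in isolation (round_down_to_chunk is a hypothetical name):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t round_down_to_chunk(uint64_t sectors,
                                        uint64_t chunk_sectors)
    {
        /* only valid for power-of-two chunk sizes */
        assert(chunk_sectors && (chunk_sectors & (chunk_sectors - 1)) == 0);
        return sectors & ~(chunk_sectors - 1);
    }

    /* e.g. round_down_to_chunk(1000, 128) == 896, i.e. 7 whole chunks */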
@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev)
 	    mddev->new_layout == mddev->layout &&
 	    mddev->new_chunk_sectors == mddev->chunk_sectors)
 		return 0; /* nothing to do */
-	if (mddev->bitmap)
-		/* Cannot grow a bitmap yet */
-		return -EBUSY;
 	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
@@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev)
 	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
 
-	rdev_for_each(rdev, mddev)
+	if (has_failed(conf))
+		return -EINVAL;
+
+	rdev_for_each(rdev, mddev) {
 		if (!test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags))
 			spares++;
+	}
 
 	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
 		/* Not enough devices even to make a degraded array
@@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev)
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->prev_algo = conf->algorithm;
 	conf->algorithm = mddev->new_layout;
-	if (mddev->delta_disks < 0)
+	conf->generation++;
+	/* Code that selects data_offset needs to see the generation update
+	 * if reshape_progress has been set - so a memory barrier needed.
+	 */
+	smp_mb();
+	if (mddev->reshape_backwards)
 		conf->reshape_progress = raid5_size(mddev, 0, 0);
 	else
 		conf->reshape_progress = 0;
 	conf->reshape_safe = conf->reshape_progress;
-	conf->generation++;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* Add some new drives, as many as will fit.
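Annotation: moving conf->generation++ ahead of the reshape_progress store,
with an smp_mb() between them, pairs with the smp_rmb() in use_new_offset():
a reader that observes the new reshape_progress is then guaranteed to also
observe the new generation.  A userspace analogue using C11 fences in place
of the kernel barriers (hypothetical names; a sketch of the ordering only,
not of the locking around it):

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic int generation;
    static _Atomic uint64_t reshape_progress = UINT64_MAX; /* "MaxSector" */

    static void model_start_reshape(uint64_t start, int new_generation)
    {
        atomic_store_explicit(&generation, new_generation,
                              memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* ~ smp_mb() */
        atomic_store_explicit(&reshape_progress, start,
                              memory_order_relaxed);
    }

    static int model_use_new_offset(int stripe_generation)
    {
        uint64_t progress =
            atomic_load_explicit(&reshape_progress, memory_order_relaxed);

        atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
        if (progress == UINT64_MAX)
            return 0;
        return stripe_generation !=
               atomic_load_explicit(&generation, memory_order_relaxed) - 1;
    }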
@@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev)
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		rdev_for_each(rdev, mddev)
+			rdev->new_data_offset = rdev->data_offset;
+		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		mddev->reshape_position = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
@@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf)
 {
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		struct md_rdev *rdev;
 
 		spin_lock_irq(&conf->device_lock);
 		conf->previous_raid_disks = conf->raid_disks;
+		rdev_for_each(rdev, conf->mddev)
+			rdev->data_offset = rdev->new_data_offset;
+		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
@@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev)
 			     d < conf->raid_disks - mddev->delta_disks;
 			     d++) {
 				struct md_rdev *rdev = conf->disks[d].rdev;
-				if (rdev &&
-				    raid5_remove_disk(mddev, rdev) == 0) {
-					sysfs_unlink_rdev(mddev, rdev);
-					rdev->raid_disk = -1;
-				}
+				if (rdev)
+					clear_bit(In_sync, &rdev->flags);
+				rdev = conf->disks[d].replacement;
+				if (rdev)
+					clear_bit(In_sync, &rdev->flags);
 			}
 		}
 		mddev->layout = conf->algorithm;
 		mddev->chunk_sectors = conf->chunk_sectors;
 		mddev->reshape_position = MaxSector;
 		mddev->delta_disks = 0;
+		mddev->reshape_backwards = 0;
 	}
 }
 