diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 252 |
1 files changed, 178 insertions, 74 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f351422938e0..d26767246d26 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
488 | return sh; | 488 | return sh; |
489 | } | 489 | } |
490 | 490 | ||
491 | /* Determine if 'data_offset' or 'new_data_offset' should be used | ||
492 | * in this stripe_head. | ||
493 | */ | ||
494 | static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) | ||
495 | { | ||
496 | sector_t progress = conf->reshape_progress; | ||
497 | /* Need a memory barrier to make sure we see the value | ||
498 | * of conf->generation, or ->data_offset that was set before | ||
499 | * reshape_progress was updated. | ||
500 | */ | ||
501 | smp_rmb(); | ||
502 | if (progress == MaxSector) | ||
503 | return 0; | ||
504 | if (sh->generation == conf->generation - 1) | ||
505 | return 0; | ||
506 | /* We are in a reshape, and this is a new-generation stripe, | ||
507 | * so use new_data_offset. | ||
508 | */ | ||
509 | return 1; | ||
510 | } | ||
511 | |||
491 | static void | 512 | static void |
492 | raid5_end_read_request(struct bio *bi, int error); | 513 | raid5_end_read_request(struct bio *bi, int error); |
493 | static void | 514 | static void |
@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
518 | replace_only = 1; | 539 | replace_only = 1; |
519 | } else | 540 | } else |
520 | continue; | 541 | continue; |
542 | if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) | ||
543 | rw |= REQ_SYNC; | ||
521 | 544 | ||
522 | bi = &sh->dev[i].req; | 545 | bi = &sh->dev[i].req; |
523 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ | 546 | rbi = &sh->dev[i].rreq; /* For writing to replacement */ |
@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
603 | __func__, (unsigned long long)sh->sector, | 626 | __func__, (unsigned long long)sh->sector, |
604 | bi->bi_rw, i); | 627 | bi->bi_rw, i); |
605 | atomic_inc(&sh->count); | 628 | atomic_inc(&sh->count); |
606 | bi->bi_sector = sh->sector + rdev->data_offset; | 629 | if (use_new_offset(conf, sh)) |
630 | bi->bi_sector = (sh->sector | ||
631 | + rdev->new_data_offset); | ||
632 | else | ||
633 | bi->bi_sector = (sh->sector | ||
634 | + rdev->data_offset); | ||
607 | bi->bi_flags = 1 << BIO_UPTODATE; | 635 | bi->bi_flags = 1 << BIO_UPTODATE; |
608 | bi->bi_idx = 0; | 636 | bi->bi_idx = 0; |
609 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 637 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
627 | __func__, (unsigned long long)sh->sector, | 655 | __func__, (unsigned long long)sh->sector, |
628 | rbi->bi_rw, i); | 656 | rbi->bi_rw, i); |
629 | atomic_inc(&sh->count); | 657 | atomic_inc(&sh->count); |
630 | rbi->bi_sector = sh->sector + rrdev->data_offset; | 658 | if (use_new_offset(conf, sh)) |
659 | rbi->bi_sector = (sh->sector | ||
660 | + rrdev->new_data_offset); | ||
661 | else | ||
662 | rbi->bi_sector = (sh->sector | ||
663 | + rrdev->data_offset); | ||
631 | rbi->bi_flags = 1 << BIO_UPTODATE; | 664 | rbi->bi_flags = 1 << BIO_UPTODATE; |
632 | rbi->bi_idx = 0; | 665 | rbi->bi_idx = 0; |
633 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 666 | rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1114 | dev->sector + STRIPE_SECTORS) { | 1147 | dev->sector + STRIPE_SECTORS) { |
1115 | if (wbi->bi_rw & REQ_FUA) | 1148 | if (wbi->bi_rw & REQ_FUA) |
1116 | set_bit(R5_WantFUA, &dev->flags); | 1149 | set_bit(R5_WantFUA, &dev->flags); |
1150 | if (wbi->bi_rw & REQ_SYNC) | ||
1151 | set_bit(R5_SyncIO, &dev->flags); | ||
1117 | tx = async_copy_data(1, wbi, dev->page, | 1152 | tx = async_copy_data(1, wbi, dev->page, |
1118 | dev->sector, tx); | 1153 | dev->sector, tx); |
1119 | wbi = r5_next_bio(wbi, dev->sector); | 1154 | wbi = r5_next_bio(wbi, dev->sector); |
@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1131 | int pd_idx = sh->pd_idx; | 1166 | int pd_idx = sh->pd_idx; |
1132 | int qd_idx = sh->qd_idx; | 1167 | int qd_idx = sh->qd_idx; |
1133 | int i; | 1168 | int i; |
1134 | bool fua = false; | 1169 | bool fua = false, sync = false; |
1135 | 1170 | ||
1136 | pr_debug("%s: stripe %llu\n", __func__, | 1171 | pr_debug("%s: stripe %llu\n", __func__, |
1137 | (unsigned long long)sh->sector); | 1172 | (unsigned long long)sh->sector); |
1138 | 1173 | ||
1139 | for (i = disks; i--; ) | 1174 | for (i = disks; i--; ) { |
1140 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); | 1175 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); |
1176 | sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); | ||
1177 | } | ||
1141 | 1178 | ||
1142 | for (i = disks; i--; ) { | 1179 | for (i = disks; i--; ) { |
1143 | struct r5dev *dev = &sh->dev[i]; | 1180 | struct r5dev *dev = &sh->dev[i]; |
@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1146 | set_bit(R5_UPTODATE, &dev->flags); | 1183 | set_bit(R5_UPTODATE, &dev->flags); |
1147 | if (fua) | 1184 | if (fua) |
1148 | set_bit(R5_WantFUA, &dev->flags); | 1185 | set_bit(R5_WantFUA, &dev->flags); |
1186 | if (sync) | ||
1187 | set_bit(R5_SyncIO, &dev->flags); | ||
1149 | } | 1188 | } |
1150 | } | 1189 | } |
1151 | 1190 | ||
@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1648 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1687 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1649 | char b[BDEVNAME_SIZE]; | 1688 | char b[BDEVNAME_SIZE]; |
1650 | struct md_rdev *rdev = NULL; | 1689 | struct md_rdev *rdev = NULL; |
1651 | 1690 | sector_t s; | |
1652 | 1691 | ||
1653 | for (i=0 ; i<disks; i++) | 1692 | for (i=0 ; i<disks; i++) |
1654 | if (bi == &sh->dev[i].req) | 1693 | if (bi == &sh->dev[i].req) |
@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1671 | if (!rdev) | 1710 | if (!rdev) |
1672 | rdev = conf->disks[i].rdev; | 1711 | rdev = conf->disks[i].rdev; |
1673 | 1712 | ||
1713 | if (use_new_offset(conf, sh)) | ||
1714 | s = sh->sector + rdev->new_data_offset; | ||
1715 | else | ||
1716 | s = sh->sector + rdev->data_offset; | ||
1674 | if (uptodate) { | 1717 | if (uptodate) { |
1675 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1718 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1676 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1719 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1683 | "md/raid:%s: read error corrected" | 1726 | "md/raid:%s: read error corrected" |
1684 | " (%lu sectors at %llu on %s)\n", | 1727 | " (%lu sectors at %llu on %s)\n", |
1685 | mdname(conf->mddev), STRIPE_SECTORS, | 1728 | mdname(conf->mddev), STRIPE_SECTORS, |
1686 | (unsigned long long)(sh->sector | 1729 | (unsigned long long)s, |
1687 | + rdev->data_offset), | ||
1688 | bdevname(rdev->bdev, b)); | 1730 | bdevname(rdev->bdev, b)); |
1689 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | 1731 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); |
1690 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1732 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1704 | "md/raid:%s: read error on replacement device " | 1746 | "md/raid:%s: read error on replacement device " |
1705 | "(sector %llu on %s).\n", | 1747 | "(sector %llu on %s).\n", |
1706 | mdname(conf->mddev), | 1748 | mdname(conf->mddev), |
1707 | (unsigned long long)(sh->sector | 1749 | (unsigned long long)s, |
1708 | + rdev->data_offset), | ||
1709 | bdn); | 1750 | bdn); |
1710 | else if (conf->mddev->degraded >= conf->max_degraded) | 1751 | else if (conf->mddev->degraded >= conf->max_degraded) |
1711 | printk_ratelimited( | 1752 | printk_ratelimited( |
@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1713 | "md/raid:%s: read error not correctable " | 1754 | "md/raid:%s: read error not correctable " |
1714 | "(sector %llu on %s).\n", | 1755 | "(sector %llu on %s).\n", |
1715 | mdname(conf->mddev), | 1756 | mdname(conf->mddev), |
1716 | (unsigned long long)(sh->sector | 1757 | (unsigned long long)s, |
1717 | + rdev->data_offset), | ||
1718 | bdn); | 1758 | bdn); |
1719 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1759 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
1720 | /* Oh, no!!! */ | 1760 | /* Oh, no!!! */ |
@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1723 | "md/raid:%s: read error NOT corrected!! " | 1763 | "md/raid:%s: read error NOT corrected!! " |
1724 | "(sector %llu on %s).\n", | 1764 | "(sector %llu on %s).\n", |
1725 | mdname(conf->mddev), | 1765 | mdname(conf->mddev), |
1726 | (unsigned long long)(sh->sector | 1766 | (unsigned long long)s, |
1727 | + rdev->data_offset), | ||
1728 | bdn); | 1767 | bdn); |
1729 | else if (atomic_read(&rdev->read_errors) | 1768 | else if (atomic_read(&rdev->read_errors) |
1730 | > conf->max_nr_stripes) | 1769 | > conf->max_nr_stripes) |
@@ -3561,7 +3600,7 @@ finish: | |||
3561 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { | 3600 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { |
3562 | rdev = conf->disks[i].rdev; | 3601 | rdev = conf->disks[i].rdev; |
3563 | rdev_clear_badblocks(rdev, sh->sector, | 3602 | rdev_clear_badblocks(rdev, sh->sector, |
3564 | STRIPE_SECTORS); | 3603 | STRIPE_SECTORS, 0); |
3565 | rdev_dec_pending(rdev, conf->mddev); | 3604 | rdev_dec_pending(rdev, conf->mddev); |
3566 | } | 3605 | } |
3567 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { | 3606 | if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { |
@@ -3570,7 +3609,7 @@ finish: | |||
3570 | /* rdev have been moved down */ | 3609 | /* rdev have been moved down */ |
3571 | rdev = conf->disks[i].rdev; | 3610 | rdev = conf->disks[i].rdev; |
3572 | rdev_clear_badblocks(rdev, sh->sector, | 3611 | rdev_clear_badblocks(rdev, sh->sector, |
3573 | STRIPE_SECTORS); | 3612 | STRIPE_SECTORS, 0); |
3574 | rdev_dec_pending(rdev, conf->mddev); | 3613 | rdev_dec_pending(rdev, conf->mddev); |
3575 | } | 3614 | } |
3576 | } | 3615 | } |
@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) | |||
3842 | raid_bio->bi_next = (void*)rdev; | 3881 | raid_bio->bi_next = (void*)rdev; |
3843 | align_bi->bi_bdev = rdev->bdev; | 3882 | align_bi->bi_bdev = rdev->bdev; |
3844 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3883 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
3884 | /* No reshape active, so we can trust rdev->data_offset */ | ||
3845 | align_bi->bi_sector += rdev->data_offset; | 3885 | align_bi->bi_sector += rdev->data_offset; |
3846 | 3886 | ||
3847 | if (!bio_fits_rdev(align_bi) || | 3887 | if (!bio_fits_rdev(align_bi) || |
@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3953 | plugged = mddev_check_plugged(mddev); | 3993 | plugged = mddev_check_plugged(mddev); |
3954 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | 3994 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { |
3955 | DEFINE_WAIT(w); | 3995 | DEFINE_WAIT(w); |
3956 | int disks, data_disks; | ||
3957 | int previous; | 3996 | int previous; |
3958 | 3997 | ||
3959 | retry: | 3998 | retry: |
3960 | previous = 0; | 3999 | previous = 0; |
3961 | disks = conf->raid_disks; | ||
3962 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | 4000 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); |
3963 | if (unlikely(conf->reshape_progress != MaxSector)) { | 4001 | if (unlikely(conf->reshape_progress != MaxSector)) { |
3964 | /* spinlock is needed as reshape_progress may be | 4002 | /* spinlock is needed as reshape_progress may be |
@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3970 | * to check again. | 4008 | * to check again. |
3971 | */ | 4009 | */ |
3972 | spin_lock_irq(&conf->device_lock); | 4010 | spin_lock_irq(&conf->device_lock); |
3973 | if (mddev->delta_disks < 0 | 4011 | if (mddev->reshape_backwards |
3974 | ? logical_sector < conf->reshape_progress | 4012 | ? logical_sector < conf->reshape_progress |
3975 | : logical_sector >= conf->reshape_progress) { | 4013 | : logical_sector >= conf->reshape_progress) { |
3976 | disks = conf->previous_raid_disks; | ||
3977 | previous = 1; | 4014 | previous = 1; |
3978 | } else { | 4015 | } else { |
3979 | if (mddev->delta_disks < 0 | 4016 | if (mddev->reshape_backwards |
3980 | ? logical_sector < conf->reshape_safe | 4017 | ? logical_sector < conf->reshape_safe |
3981 | : logical_sector >= conf->reshape_safe) { | 4018 | : logical_sector >= conf->reshape_safe) { |
3982 | spin_unlock_irq(&conf->device_lock); | 4019 | spin_unlock_irq(&conf->device_lock); |
@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
3986 | } | 4023 | } |
3987 | spin_unlock_irq(&conf->device_lock); | 4024 | spin_unlock_irq(&conf->device_lock); |
3988 | } | 4025 | } |
3989 | data_disks = disks - conf->max_degraded; | ||
3990 | 4026 | ||
3991 | new_sector = raid5_compute_sector(conf, logical_sector, | 4027 | new_sector = raid5_compute_sector(conf, logical_sector, |
3992 | previous, | 4028 | previous, |
@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4009 | */ | 4045 | */ |
4010 | int must_retry = 0; | 4046 | int must_retry = 0; |
4011 | spin_lock_irq(&conf->device_lock); | 4047 | spin_lock_irq(&conf->device_lock); |
4012 | if (mddev->delta_disks < 0 | 4048 | if (mddev->reshape_backwards |
4013 | ? logical_sector >= conf->reshape_progress | 4049 | ? logical_sector >= conf->reshape_progress |
4014 | : logical_sector < conf->reshape_progress) | 4050 | : logical_sector < conf->reshape_progress) |
4015 | /* mismatch, need to try again */ | 4051 | /* mismatch, need to try again */ |
@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4108 | 4144 | ||
4109 | if (sector_nr == 0) { | 4145 | if (sector_nr == 0) { |
4110 | /* If restarting in the middle, skip the initial sectors */ | 4146 | /* If restarting in the middle, skip the initial sectors */ |
4111 | if (mddev->delta_disks < 0 && | 4147 | if (mddev->reshape_backwards && |
4112 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { | 4148 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { |
4113 | sector_nr = raid5_size(mddev, 0, 0) | 4149 | sector_nr = raid5_size(mddev, 0, 0) |
4114 | - conf->reshape_progress; | 4150 | - conf->reshape_progress; |
4115 | } else if (mddev->delta_disks >= 0 && | 4151 | } else if (!mddev->reshape_backwards && |
4116 | conf->reshape_progress > 0) | 4152 | conf->reshape_progress > 0) |
4117 | sector_nr = conf->reshape_progress; | 4153 | sector_nr = conf->reshape_progress; |
4118 | sector_div(sector_nr, new_data_disks); | 4154 | sector_div(sector_nr, new_data_disks); |
@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4133 | else | 4169 | else |
4134 | reshape_sectors = mddev->chunk_sectors; | 4170 | reshape_sectors = mddev->chunk_sectors; |
4135 | 4171 | ||
4136 | /* we update the metadata when there is more than 3Meg | 4172 | /* We update the metadata at least every 10 seconds, or when |
4137 | * in the block range (that is rather arbitrary, should | 4173 | * the data about to be copied would over-write the source of |
4138 | * probably be time based) or when the data about to be | 4174 | * the data at the front of the range. i.e. one new_stripe |
4139 | * copied would over-write the source of the data at | 4175 | * along from reshape_progress new_maps to after where |
4140 | * the front of the range. | 4176 | * reshape_safe old_maps to |
4141 | * i.e. one new_stripe along from reshape_progress new_maps | ||
4142 | * to after where reshape_safe old_maps to | ||
4143 | */ | 4177 | */ |
4144 | writepos = conf->reshape_progress; | 4178 | writepos = conf->reshape_progress; |
4145 | sector_div(writepos, new_data_disks); | 4179 | sector_div(writepos, new_data_disks); |
@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4147 | sector_div(readpos, data_disks); | 4181 | sector_div(readpos, data_disks); |
4148 | safepos = conf->reshape_safe; | 4182 | safepos = conf->reshape_safe; |
4149 | sector_div(safepos, data_disks); | 4183 | sector_div(safepos, data_disks); |
4150 | if (mddev->delta_disks < 0) { | 4184 | if (mddev->reshape_backwards) { |
4151 | writepos -= min_t(sector_t, reshape_sectors, writepos); | 4185 | writepos -= min_t(sector_t, reshape_sectors, writepos); |
4152 | readpos += reshape_sectors; | 4186 | readpos += reshape_sectors; |
4153 | safepos += reshape_sectors; | 4187 | safepos += reshape_sectors; |
@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4157 | safepos -= min_t(sector_t, reshape_sectors, safepos); | 4191 | safepos -= min_t(sector_t, reshape_sectors, safepos); |
4158 | } | 4192 | } |
4159 | 4193 | ||
4194 | /* Having calculated the 'writepos' possibly use it | ||
4195 | * to set 'stripe_addr' which is where we will write to. | ||
4196 | */ | ||
4197 | if (mddev->reshape_backwards) { | ||
4198 | BUG_ON(conf->reshape_progress == 0); | ||
4199 | stripe_addr = writepos; | ||
4200 | BUG_ON((mddev->dev_sectors & | ||
4201 | ~((sector_t)reshape_sectors - 1)) | ||
4202 | - reshape_sectors - stripe_addr | ||
4203 | != sector_nr); | ||
4204 | } else { | ||
4205 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
4206 | stripe_addr = sector_nr; | ||
4207 | } | ||
4208 | |||
4160 | /* 'writepos' is the most advanced device address we might write. | 4209 | /* 'writepos' is the most advanced device address we might write. |
4161 | * 'readpos' is the least advanced device address we might read. | 4210 | * 'readpos' is the least advanced device address we might read. |
4162 | * 'safepos' is the least address recorded in the metadata as having | 4211 | * 'safepos' is the least address recorded in the metadata as having |
4163 | * been reshaped. | 4212 | * been reshaped. |
4164 | * If 'readpos' is behind 'writepos', then there is no way that we can | 4213 | * If there is a min_offset_diff, these are adjusted either by |
4214 | * increasing the safepos/readpos if diff is negative, or | ||
4215 | * increasing writepos if diff is positive. | ||
4216 | * If 'readpos' is then behind 'writepos', there is no way that we can | ||
4165 | * ensure safety in the face of a crash - that must be done by userspace | 4217 | * ensure safety in the face of a crash - that must be done by userspace |
4166 | * making a backup of the data. So in that case there is no particular | 4218 | * making a backup of the data. So in that case there is no particular |
4167 | * rush to update metadata. | 4219 | * rush to update metadata. |
@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4174 | * Maybe that number should be configurable, but I'm not sure it is | 4226 | * Maybe that number should be configurable, but I'm not sure it is |
4175 | * worth it.... maybe it could be a multiple of safemode_delay??? | 4227 | * worth it.... maybe it could be a multiple of safemode_delay??? |
4176 | */ | 4228 | */ |
4177 | if ((mddev->delta_disks < 0 | 4229 | if (conf->min_offset_diff < 0) { |
4230 | safepos += -conf->min_offset_diff; | ||
4231 | readpos += -conf->min_offset_diff; | ||
4232 | } else | ||
4233 | writepos += conf->min_offset_diff; | ||
4234 | |||
4235 | if ((mddev->reshape_backwards | ||
4178 | ? (safepos > writepos && readpos < writepos) | 4236 | ? (safepos > writepos && readpos < writepos) |
4179 | : (safepos < writepos && readpos > writepos)) || | 4237 | : (safepos < writepos && readpos > writepos)) || |
4180 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | 4238 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { |
@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4195 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 4253 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
4196 | } | 4254 | } |
4197 | 4255 | ||
4198 | if (mddev->delta_disks < 0) { | ||
4199 | BUG_ON(conf->reshape_progress == 0); | ||
4200 | stripe_addr = writepos; | ||
4201 | BUG_ON((mddev->dev_sectors & | ||
4202 | ~((sector_t)reshape_sectors - 1)) | ||
4203 | - reshape_sectors - stripe_addr | ||
4204 | != sector_nr); | ||
4205 | } else { | ||
4206 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
4207 | stripe_addr = sector_nr; | ||
4208 | } | ||
4209 | INIT_LIST_HEAD(&stripes); | 4256 | INIT_LIST_HEAD(&stripes); |
4210 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { | 4257 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { |
4211 | int j; | 4258 | int j; |
@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
4239 | list_add(&sh->lru, &stripes); | 4286 | list_add(&sh->lru, &stripes); |
4240 | } | 4287 | } |
4241 | spin_lock_irq(&conf->device_lock); | 4288 | spin_lock_irq(&conf->device_lock); |
4242 | if (mddev->delta_disks < 0) | 4289 | if (mddev->reshape_backwards) |
4243 | conf->reshape_progress -= reshape_sectors * new_data_disks; | 4290 | conf->reshape_progress -= reshape_sectors * new_data_disks; |
4244 | else | 4291 | else |
4245 | conf->reshape_progress += reshape_sectors * new_data_disks; | 4292 | conf->reshape_progress += reshape_sectors * new_data_disks; |
@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev) | |||
4952 | struct md_rdev *rdev; | 4999 | struct md_rdev *rdev; |
4953 | sector_t reshape_offset = 0; | 5000 | sector_t reshape_offset = 0; |
4954 | int i; | 5001 | int i; |
5002 | long long min_offset_diff = 0; | ||
5003 | int first = 1; | ||
4955 | 5004 | ||
4956 | if (mddev->recovery_cp != MaxSector) | 5005 | if (mddev->recovery_cp != MaxSector) |
4957 | printk(KERN_NOTICE "md/raid:%s: not clean" | 5006 | printk(KERN_NOTICE "md/raid:%s: not clean" |
4958 | " -- starting background reconstruction\n", | 5007 | " -- starting background reconstruction\n", |
4959 | mdname(mddev)); | 5008 | mdname(mddev)); |
5009 | |||
5010 | rdev_for_each(rdev, mddev) { | ||
5011 | long long diff; | ||
5012 | if (rdev->raid_disk < 0) | ||
5013 | continue; | ||
5014 | diff = (rdev->new_data_offset - rdev->data_offset); | ||
5015 | if (first) { | ||
5016 | min_offset_diff = diff; | ||
5017 | first = 0; | ||
5018 | } else if (mddev->reshape_backwards && | ||
5019 | diff < min_offset_diff) | ||
5020 | min_offset_diff = diff; | ||
5021 | else if (!mddev->reshape_backwards && | ||
5022 | diff > min_offset_diff) | ||
5023 | min_offset_diff = diff; | ||
5024 | } | ||
5025 | |||
4960 | if (mddev->reshape_position != MaxSector) { | 5026 | if (mddev->reshape_position != MaxSector) { |
4961 | /* Check that we can continue the reshape. | 5027 | /* Check that we can continue the reshape. |
4962 | * Currently only disks can change, it must | 5028 | * Difficulties arise if the stripe we would write to |
4963 | * increase, and we must be past the point where | 5029 | * next is at or after the stripe we would read from next. |
4964 | * a stripe over-writes itself | 5030 | * For a reshape that changes the number of devices, this |
5031 | * is only possible for a very short time, and mdadm makes | ||
5032 | * sure that time appears to have past before assembling | ||
5033 | * the array. So we fail if that time hasn't passed. | ||
5034 | * For a reshape that keeps the number of devices the same | ||
5035 | * mdadm must be monitoring the reshape and keeping the | |
5036 | * critical areas read-only and backed up. It will start | ||
5037 | * the array in read-only mode, so we check for that. | ||
4965 | */ | 5038 | */ |
4966 | sector_t here_new, here_old; | 5039 | sector_t here_new, here_old; |
4967 | int old_disks; | 5040 | int old_disks; |
@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev) | |||
4993 | /* here_old is the first stripe that we might need to read | 5066 | /* here_old is the first stripe that we might need to read |
4994 | * from */ | 5067 | * from */ |
4995 | if (mddev->delta_disks == 0) { | 5068 | if (mddev->delta_disks == 0) { |
5069 | if ((here_new * mddev->new_chunk_sectors != | ||
5070 | here_old * mddev->chunk_sectors)) { | ||
5071 | printk(KERN_ERR "md/raid:%s: reshape position is" | ||
5072 | " confused - aborting\n", mdname(mddev)); | ||
5073 | return -EINVAL; | ||
5074 | } | ||
4996 | /* We cannot be sure it is safe to start an in-place | 5075 | /* We cannot be sure it is safe to start an in-place |
4997 | * reshape. It is only safe if user-space if monitoring | 5076 | * reshape. It is only safe if user-space is monitoring |
4998 | * and taking constant backups. | 5077 | * and taking constant backups. |
4999 | * mdadm always starts a situation like this in | 5078 | * mdadm always starts a situation like this in |
5000 | * readonly mode so it can take control before | 5079 | * readonly mode so it can take control before |
5001 | * allowing any writes. So just check for that. | 5080 | * allowing any writes. So just check for that. |
5002 | */ | 5081 | */ |
5003 | if ((here_new * mddev->new_chunk_sectors != | 5082 | if (abs(min_offset_diff) >= mddev->chunk_sectors && |
5004 | here_old * mddev->chunk_sectors) || | 5083 | abs(min_offset_diff) >= mddev->new_chunk_sectors) |
5005 | mddev->ro == 0) { | 5084 | /* not really in-place - so OK */; |
5006 | printk(KERN_ERR "md/raid:%s: in-place reshape must be started" | 5085 | else if (mddev->ro == 0) { |
5007 | " in read-only mode - aborting\n", | 5086 | printk(KERN_ERR "md/raid:%s: in-place reshape " |
5087 | "must be started in read-only mode " | ||
5088 | "- aborting\n", | ||
5008 | mdname(mddev)); | 5089 | mdname(mddev)); |
5009 | return -EINVAL; | 5090 | return -EINVAL; |
5010 | } | 5091 | } |
5011 | } else if (mddev->delta_disks < 0 | 5092 | } else if (mddev->reshape_backwards |
5012 | ? (here_new * mddev->new_chunk_sectors <= | 5093 | ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= |
5013 | here_old * mddev->chunk_sectors) | 5094 | here_old * mddev->chunk_sectors) |
5014 | : (here_new * mddev->new_chunk_sectors >= | 5095 | : (here_new * mddev->new_chunk_sectors >= |
5015 | here_old * mddev->chunk_sectors)) { | 5096 | here_old * mddev->chunk_sectors + (-min_offset_diff))) { |
5016 | /* Reading from the same stripe as writing to - bad */ | 5097 | /* Reading from the same stripe as writing to - bad */ |
5017 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " | 5098 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " |
5018 | "auto-recovery - aborting.\n", | 5099 | "auto-recovery - aborting.\n", |
@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev) | |||
5037 | if (IS_ERR(conf)) | 5118 | if (IS_ERR(conf)) |
5038 | return PTR_ERR(conf); | 5119 | return PTR_ERR(conf); |
5039 | 5120 | ||
5121 | conf->min_offset_diff = min_offset_diff; | ||
5040 | mddev->thread = conf->thread; | 5122 | mddev->thread = conf->thread; |
5041 | conf->thread = NULL; | 5123 | conf->thread = NULL; |
5042 | mddev->private = conf; | 5124 | mddev->private = conf; |
@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev) | |||
5182 | blk_queue_io_opt(mddev->queue, chunk_size * | 5264 | blk_queue_io_opt(mddev->queue, chunk_size * |
5183 | (conf->raid_disks - conf->max_degraded)); | 5265 | (conf->raid_disks - conf->max_degraded)); |
5184 | 5266 | ||
5185 | rdev_for_each(rdev, mddev) | 5267 | rdev_for_each(rdev, mddev) { |
5186 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 5268 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
5187 | rdev->data_offset << 9); | 5269 | rdev->data_offset << 9); |
5270 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
5271 | rdev->new_data_offset << 9); | ||
5272 | } | ||
5188 | } | 5273 | } |
5189 | 5274 | ||
5190 | return 0; | 5275 | return 0; |
@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) | |||
5418 | * any io in the removed space completes, but it hardly seems | 5503 | * any io in the removed space completes, but it hardly seems |
5419 | * worth it. | 5504 | * worth it. |
5420 | */ | 5505 | */ |
5506 | sector_t newsize; | ||
5421 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); | 5507 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
5422 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, | 5508 | newsize = raid5_size(mddev, sectors, mddev->raid_disks); |
5423 | mddev->raid_disks)); | 5509 | if (mddev->external_size && |
5424 | if (mddev->array_sectors > | 5510 | mddev->array_sectors > newsize) |
5425 | raid5_size(mddev, sectors, mddev->raid_disks)) | ||
5426 | return -EINVAL; | 5511 | return -EINVAL; |
5512 | if (mddev->bitmap) { | ||
5513 | int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); | ||
5514 | if (ret) | ||
5515 | return ret; | ||
5516 | } | ||
5517 | md_set_array_sectors(mddev, newsize); | ||
5427 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5518 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5428 | revalidate_disk(mddev->gendisk); | 5519 | revalidate_disk(mddev->gendisk); |
5429 | if (sectors > mddev->dev_sectors && | 5520 | if (sectors > mddev->dev_sectors && |
@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev) | |||
5468 | mddev->new_layout == mddev->layout && | 5559 | mddev->new_layout == mddev->layout && |
5469 | mddev->new_chunk_sectors == mddev->chunk_sectors) | 5560 | mddev->new_chunk_sectors == mddev->chunk_sectors) |
5470 | return 0; /* nothing to do */ | 5561 | return 0; /* nothing to do */ |
5471 | if (mddev->bitmap) | ||
5472 | /* Cannot grow a bitmap yet */ | ||
5473 | return -EBUSY; | ||
5474 | if (has_failed(conf)) | 5562 | if (has_failed(conf)) |
5475 | return -EINVAL; | 5563 | return -EINVAL; |
5476 | if (mddev->delta_disks < 0) { | 5564 | if (mddev->delta_disks < 0) { |
@@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5505 | if (!check_stripe_cache(mddev)) | 5593 | if (!check_stripe_cache(mddev)) |
5506 | return -ENOSPC; | 5594 | return -ENOSPC; |
5507 | 5595 | ||
5508 | rdev_for_each(rdev, mddev) | 5596 | if (has_failed(conf)) |
5597 | return -EINVAL; | ||
5598 | |||
5599 | rdev_for_each(rdev, mddev) { | ||
5509 | if (!test_bit(In_sync, &rdev->flags) | 5600 | if (!test_bit(In_sync, &rdev->flags) |
5510 | && !test_bit(Faulty, &rdev->flags)) | 5601 | && !test_bit(Faulty, &rdev->flags)) |
5511 | spares++; | 5602 | spares++; |
5603 | } | ||
5512 | 5604 | ||
5513 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) | 5605 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) |
5514 | /* Not enough devices even to make a degraded array | 5606 | /* Not enough devices even to make a degraded array |
@@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5535 | conf->chunk_sectors = mddev->new_chunk_sectors; | 5627 | conf->chunk_sectors = mddev->new_chunk_sectors; |
5536 | conf->prev_algo = conf->algorithm; | 5628 | conf->prev_algo = conf->algorithm; |
5537 | conf->algorithm = mddev->new_layout; | 5629 | conf->algorithm = mddev->new_layout; |
5538 | if (mddev->delta_disks < 0) | 5630 | conf->generation++; |
5631 | /* Code that selects data_offset needs to see the generation update | ||
5632 | * if reshape_progress has been set - so a memory barrier is needed. | |
5633 | */ | ||
5634 | smp_mb(); | ||
5635 | if (mddev->reshape_backwards) | ||
5539 | conf->reshape_progress = raid5_size(mddev, 0, 0); | 5636 | conf->reshape_progress = raid5_size(mddev, 0, 0); |
5540 | else | 5637 | else |
5541 | conf->reshape_progress = 0; | 5638 | conf->reshape_progress = 0; |
5542 | conf->reshape_safe = conf->reshape_progress; | 5639 | conf->reshape_safe = conf->reshape_progress; |
5543 | conf->generation++; | ||
5544 | spin_unlock_irq(&conf->device_lock); | 5640 | spin_unlock_irq(&conf->device_lock); |
5545 | 5641 | ||
5546 | /* Add some new drives, as many as will fit. | 5642 | /* Add some new drives, as many as will fit. |
@@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5592 | mddev->recovery = 0; | 5688 | mddev->recovery = 0; |
5593 | spin_lock_irq(&conf->device_lock); | 5689 | spin_lock_irq(&conf->device_lock); |
5594 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 5690 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
5691 | rdev_for_each(rdev, mddev) | ||
5692 | rdev->new_data_offset = rdev->data_offset; | ||
5693 | smp_wmb(); | ||
5595 | conf->reshape_progress = MaxSector; | 5694 | conf->reshape_progress = MaxSector; |
5596 | mddev->reshape_position = MaxSector; | 5695 | mddev->reshape_position = MaxSector; |
5597 | spin_unlock_irq(&conf->device_lock); | 5696 | spin_unlock_irq(&conf->device_lock); |
@@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf) | |||
5610 | { | 5709 | { |
5611 | 5710 | ||
5612 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { | 5711 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { |
5712 | struct md_rdev *rdev; | ||
5613 | 5713 | ||
5614 | spin_lock_irq(&conf->device_lock); | 5714 | spin_lock_irq(&conf->device_lock); |
5615 | conf->previous_raid_disks = conf->raid_disks; | 5715 | conf->previous_raid_disks = conf->raid_disks; |
5716 | rdev_for_each(rdev, conf->mddev) | ||
5717 | rdev->data_offset = rdev->new_data_offset; | ||
5718 | smp_wmb(); | ||
5616 | conf->reshape_progress = MaxSector; | 5719 | conf->reshape_progress = MaxSector; |
5617 | spin_unlock_irq(&conf->device_lock); | 5720 | spin_unlock_irq(&conf->device_lock); |
5618 | wake_up(&conf->wait_for_overlap); | 5721 | wake_up(&conf->wait_for_overlap); |
@@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev) | |||
5652 | d < conf->raid_disks - mddev->delta_disks; | 5755 | d < conf->raid_disks - mddev->delta_disks; |
5653 | d++) { | 5756 | d++) { |
5654 | struct md_rdev *rdev = conf->disks[d].rdev; | 5757 | struct md_rdev *rdev = conf->disks[d].rdev; |
5655 | if (rdev && | 5758 | if (rdev) |
5656 | raid5_remove_disk(mddev, rdev) == 0) { | 5759 | clear_bit(In_sync, &rdev->flags); |
5657 | sysfs_unlink_rdev(mddev, rdev); | 5760 | rdev = conf->disks[d].replacement; |
5658 | rdev->raid_disk = -1; | 5761 | if (rdev) |
5659 | } | 5762 | clear_bit(In_sync, &rdev->flags); |
5660 | } | 5763 | } |
5661 | } | 5764 | } |
5662 | mddev->layout = conf->algorithm; | 5765 | mddev->layout = conf->algorithm; |
5663 | mddev->chunk_sectors = conf->chunk_sectors; | 5766 | mddev->chunk_sectors = conf->chunk_sectors; |
5664 | mddev->reshape_position = MaxSector; | 5767 | mddev->reshape_position = MaxSector; |
5665 | mddev->delta_disks = 0; | 5768 | mddev->delta_disks = 0; |
5769 | mddev->reshape_backwards = 0; | ||
5666 | } | 5770 | } |
5667 | } | 5771 | } |
5668 | 5772 | ||