 drivers/md/dm-raid.c | 505 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 447 insertions(+), 58 deletions(-)
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 14835ae064c1..e4c41232107f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -189,6 +189,7 @@ struct raid_dev {
 #define RT_FLAG_RS_RESUMED		1
 #define RT_FLAG_RS_BITMAP_LOADED	2
 #define RT_FLAG_UPDATE_SBS		3
+#define RT_FLAG_RESHAPE_RS		4
 
 /* Array elements of 64 bit needed for rebuild/write_mostly bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
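DISKS_ARRAY_ELEMS above is a ceiling division sizing a uint64_t bitmap with one bit per possible raid device. A standalone check of the arithmetic; MAX_RAID_DEVICES = 253 is an assumption taken from md's device limit, since this hunk does not show its definition:

#include <stdint.h>
#include <stdio.h>

#define MAX_RAID_DEVICES 253	/* assumed; not visible in this hunk */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)

int main(void)
{
	printf("%zu\n", DISKS_ARRAY_ELEMS);	/* 4 == ceil(253 / 64) */
	return 0;
}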
@@ -206,6 +207,7 @@ struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
+	uint32_t stripe_cache_entries;
 	unsigned long ctr_flags;
 	unsigned long runtime_flags;
 
@@ -219,25 +221,22 @@ struct raid_set {
 	struct mddev md;
 	struct raid_type *raid_type;
 	struct dm_target_callbacks callbacks;
-	struct rs_layout rs_layout;
 
 	struct raid_dev dev[0];
 };
 
-static void rs_config_backup(struct raid_set *rs)
+static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
 {
 	struct mddev *mddev = &rs->md;
-	struct rs_layout *l = &rs->rs_layout;
 
 	l->new_level = mddev->new_level;
 	l->new_layout = mddev->new_layout;
 	l->new_chunk_sectors = mddev->new_chunk_sectors;
 }
 
-static void rs_config_restore(struct raid_set *rs)
+static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
 {
 	struct mddev *mddev = &rs->md;
-	struct rs_layout *l = &rs->rs_layout;
 
 	mddev->new_level = l->new_level;
 	mddev->new_layout = l->new_layout;
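The hunk above moves the layout snapshot off the long-lived struct raid_set and onto the caller's stack: the constructor saves the ctr-requested new_level/new_layout/new_chunk_sectors before superblock analysis overwrites them in the mddev, then restores them for the conversion decision. A minimal userspace sketch of that save/restore pattern, with simplified stand-in types and illustrative values:

#include <stdio.h>

struct mddev_like { int new_level, new_layout, new_chunk_sectors; };
struct rs_layout { int new_level, new_layout, new_chunk_sectors; };

static void rs_config_backup(struct mddev_like *mddev, struct rs_layout *l)
{
	l->new_level = mddev->new_level;
	l->new_layout = mddev->new_layout;
	l->new_chunk_sectors = mddev->new_chunk_sectors;
}

static void rs_config_restore(struct mddev_like *mddev, struct rs_layout *l)
{
	mddev->new_level = l->new_level;
	mddev->new_layout = l->new_layout;
	mddev->new_chunk_sectors = l->new_chunk_sectors;
}

int main(void)
{
	struct mddev_like mddev = { 6, 2, 256 };	/* ctr-requested layout */
	struct rs_layout saved;

	rs_config_backup(&mddev, &saved);
	mddev.new_level = 5;			/* superblock analysis overwrites it... */
	rs_config_restore(&mddev, &saved);	/* ...and the request comes back */
	printf("new_level=%d\n", mddev.new_level);	/* prints new_level=6 */
	return 0;
}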
@@ -336,6 +335,12 @@ static bool rs_is_raid0(struct raid_set *rs)
 	return !rs->md.level;
 }
 
+/* Return true, if raid set in @rs is raid1 */
+static bool rs_is_raid1(struct raid_set *rs)
+{
+	return rs->md.level == 1;
+}
+
 /* Return true, if raid set in @rs is raid10 */
 static bool rs_is_raid10(struct raid_set *rs)
 {
@@ -356,6 +361,20 @@ static bool rs_is_reshapable(struct raid_set *rs)
 	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
 }
 
+/* Return true, if raid set in @rs is recovering */
+static bool rs_is_recovering(struct raid_set *rs)
+{
+	smp_rmb();
+	return rs->md.recovery_cp != MaxSector;
+}
+
+/* Return true, if raid set in @rs is reshaping */
+static bool rs_is_reshaping(struct raid_set *rs)
+{
+	smp_rmb();
+	return rs->md.reshape_position != MaxSector;
+}
+
 /*
  * bool helpers to test for various raid levels of a raid type
  */
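Both predicates rely on md's sentinel convention: recovery_cp and reshape_position sit at MaxSector (~(sector_t)0) while no recovery or reshape is in flight, so any other value means "in progress". A standalone illustration of the convention:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long sector_t;	/* stand-in for the kernel typedef */
#define MaxSector (~(sector_t)0)

static bool in_flight(sector_t pos)
{
	return pos != MaxSector;
}

int main(void)
{
	printf("%d\n", in_flight(MaxSector));	/* 0: nothing in progress */
	printf("%d\n", in_flight(12345));	/* 1: checkpoint mid-device */
	return 0;
}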
@@ -591,6 +610,24 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
 }
 
 /*
+ * Conditionally change bdev capacity of @rs
+ * in case of a disk add/remove reshape
+ */
+static void rs_set_capacity(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	/* Make sure we access the most recent mddev properties */
+	smp_rmb();
+	if (rs->ti->len != mddev->array_sectors && !rs_is_reshaping(rs)) {
+		struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
+
+		set_capacity(gendisk, mddev->array_sectors);
+		revalidate_disk(gendisk);
+	}
+}
+
+/*
  * Set the mddev properties in @rs to the current
  * ones retrieved from the freshest superblock
  */
@@ -642,6 +679,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 
 	rs->ti = ti;
 	rs->raid_type = raid_type;
+	rs->stripe_cache_entries = 256;
 	rs->md.raid_disks = raid_devs;
 	rs->md.level = raid_type->level;
 	rs->md.new_level = rs->md.level;
@@ -874,7 +912,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned rebuilds_per_group = 0, copies, d;
+	unsigned rebuilds_per_group = 0, copies;
 	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
@@ -894,7 +932,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 			goto too_many;
 		break;
 	case 10:
-		copies = raid10_md_layout_to_copies(rs->md.layout);
+		copies = raid10_md_layout_to_copies(rs->md.new_layout);
 		if (rebuild_cnt < copies)
 			break;
 
@@ -912,12 +950,11 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 *	    A	 A    B	   B	C
 		 *	    C	 D    D	   E	E
 		 */
-		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
-			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+		if (__is_raid10_near(rs->md.new_layout)) {
+			for (i = 0; i < rs->raid_disks; i++) {
 				if (!(i % copies))
 					rebuilds_per_group = 0;
-				d = i % rs->md.raid_disks;
-				if ((!rs->dev[d].rdev.sb_page ||
+				if ((!rs->dev[i].rdev.sb_page ||
 				     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 				    (++rebuilds_per_group >= copies))
 					goto too_many;
@@ -986,10 +1023,10 @@ too_many:
 static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			     unsigned num_raid_params)
 {
-	int raid10_format = ALGORITHM_RAID10_DEFAULT;
+	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
 	unsigned raid10_copies = 2;
 	unsigned i;
-	unsigned value, region_size = 0;
+	unsigned region_size = 0;
 	sector_t max_io_len;
 	const char *arg, *key;
 	struct raid_dev *rd;
@@ -998,7 +1035,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	arg = dm_shift_arg(as);
 	num_raid_params--; /* Account for chunk_size argument */
 
-	if (kstrtouint(arg, 10, &value) < 0) {
+	if (kstrtoint(arg, 10, &value) < 0) {
 		rs->ti->error = "Bad numerical argument given for chunk_size";
 		return -EINVAL;
 	}
@@ -1105,7 +1142,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			continue;
 		}
 
-		if (kstrtouint(arg, 10, &value) < 0) {
+		if (kstrtoint(arg, 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
@@ -1207,21 +1244,12 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 
-			/*
-			 * In device-mapper, we specify things in sectors, but
-			 * MD records this value in kB
-			 */
-			value /= 2;
-
 			if (!rt_is_raid456(rt)) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 			}
-			if (raid5_set_cache_size(&rs->md, (int)value)) {
-				rs->ti->error = "Bad stripe_cache size";
-				return -EINVAL;
-			}
 
+			rs->stripe_cache_entries = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
 			if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
@@ -1303,8 +1331,6 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
 			return -EINVAL;
 		}
-
-		rs->md.layout = rs->md.new_layout;
 	}
 
 	rs->raid10_copies = raid10_copies;
@@ -1317,6 +1343,46 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	return rs_check_for_valid_flags(rs);
 }
 
+/* Set raid4/5/6 cache size */
+static int rs_set_raid456_stripe_cache(struct raid_set *rs)
+{
+	int r;
+	struct r5conf *conf;
+	struct mddev *mddev = &rs->md;
+	uint32_t min_stripes = max(mddev->chunk_sectors, mddev->new_chunk_sectors) / 2;
+	uint32_t nr_stripes = rs->stripe_cache_entries;
+
+	if (!rt_is_raid456(rs->raid_type)) {
+		rs->ti->error = "Inappropriate raid level; cannot change stripe_cache size";
+		return -EINVAL;
+	}
+
+	if (nr_stripes < min_stripes) {
+		DMINFO("Adjusting requested %u stripe cache entries to %u to suit stripe size",
+		       nr_stripes, min_stripes);
+		nr_stripes = min_stripes;
+	}
+
+	conf = mddev->private;
+	if (!conf) {
+		rs->ti->error = "Cannot change stripe_cache size on inactive RAID set";
+		return -EINVAL;
+	}
+
+	/* Try setting number of stripes in raid456 stripe cache */
+	if (conf->min_nr_stripes != nr_stripes) {
+		r = raid5_set_cache_size(mddev, nr_stripes);
+		if (r) {
+			rs->ti->error = "Failed to set raid4/5/6 stripe cache size";
+			return r;
+		}
+
+		DMINFO("%u stripe cache entries", nr_stripes);
+	}
+
+	return 0;
+}
+
 /* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
 static unsigned int mddev_data_stripes(struct raid_set *rs)
 {
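Note the unit change versus the removed ctr code: rs->stripe_cache_entries now holds a count of cache entries directly, with no sectors-to-kB conversion. The function raises too-small requests to half the larger of the current and new chunk size in sectors. A userspace sketch of just that clamp, with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

static uint32_t clamp_stripe_cache(uint32_t chunk_sectors,
				   uint32_t new_chunk_sectors,
				   uint32_t requested)
{
	uint32_t biggest = chunk_sectors > new_chunk_sectors ?
			   chunk_sectors : new_chunk_sectors;
	uint32_t min_stripes = biggest / 2;

	return requested < min_stripes ? min_stripes : requested;
}

int main(void)
{
	/* 1 MiB chunk (2048 sectors): 256 requested entries become 1024 */
	printf("%u\n", clamp_stripe_cache(2048, 2048, 256));
	/* 64 KiB chunk (128 sectors): the default 256 already suffices */
	printf("%u\n", clamp_stripe_cache(128, 128, 256));
	return 0;
}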
@@ -1337,6 +1403,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	struct mddev *mddev = &rs->md;
 	struct md_rdev *rdev;
 	sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
+	sector_t cur_dev_sectors = rs->dev[0].rdev.sectors;
 
 	if (use_mddev) {
 		delta_disks = mddev->delta_disks;
@@ -1377,6 +1444,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	mddev->array_sectors = array_sectors;
 	mddev->dev_sectors = dev_sectors;
 
+	if (!rs_is_raid0(rs) && dev_sectors > cur_dev_sectors)
+		mddev->recovery_cp = dev_sectors;
+
 	return 0;
 bad:
 	rs->ti->error = "Target length not divisible by number of data devices";
@@ -1387,6 +1457,7 @@ static void do_table_event(struct work_struct *ws)
 {
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
 
+	rs_set_capacity(rs);
 	dm_table_event(rs->ti->table);
 }
 
@@ -1410,6 +1481,17 @@ static int rs_check_takeover(struct raid_set *rs)
 	struct mddev *mddev = &rs->md;
 	unsigned int near_copies;
 
+	smp_rmb();
+	if (rs->md.degraded) {
+		rs->ti->error = "Can't takeover degraded raid set";
+		return -EPERM;
+	}
+
+	if (rs_is_reshaping(rs)) {
+		rs->ti->error = "Can't takeover reshaping raid set";
+		return -EPERM;
+	}
+
 	switch (mddev->level) {
 	case 0:
 		/* raid0 -> raid1/5 with one disk */
@@ -1419,7 +1501,7 @@ static int rs_check_takeover(struct raid_set *rs)
 
 		/* raid0 -> raid10 */
 		if (mddev->new_level == 10 &&
-		    !(rs->raid_disks % 2))
+		    !(rs->raid_disks % mddev->raid_disks))
 			return 0;
 
 		/* raid0 with multiple disks -> raid4/5/6 */
@@ -1658,6 +1740,39 @@ struct dm_raid_superblock {
 	/* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */
 } __packed;
 
+/*
+ * Check for reshape constraints on raid set @rs:
+ *
+ * - reshape function non-existent
+ * - degraded set
+ * - ongoing recovery
+ * - ongoing reshape
+ *
+ * Returns 0 if no constraint is violated, or -EPERM with the
+ * error message set in rs->ti->error otherwise.
+ */
+static int rs_check_reshape(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	smp_rmb(); /* Make sure we access recent reshape position */
+
+	if (!mddev->pers || !mddev->pers->check_reshape)
+		rs->ti->error = "Reshape not supported";
+	else if (mddev->degraded)
+		rs->ti->error = "Can't reshape degraded raid set";
+	else if (rs_is_recovering(rs))
+		rs->ti->error = "Convert request on recovering raid set prohibited";
+	else if (mddev->reshape_position && rs_is_reshaping(rs))
+		rs->ti->error = "raid set already reshaping!";
+	else if (!(rs_is_raid10(rs) || rs_is_raid456(rs)))
+		rs->ti->error = "Reshaping only supported for raid4/5/6/10";
+	else
+		return 0;
+
+	return -EPERM;
+}
+
 static int read_disk_sb(struct md_rdev *rdev, int size)
 {
 	BUG_ON(!rdev->sb_page);
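rs_check_reshape() reports through rs->ti->error plus a single -EPERM return, so the first violated constraint wins and callers only propagate the code. A userspace mirror of that guard-chain shape; the struct and messages are simplified stand-ins, not the kernel types:

#include <errno.h>
#include <stdio.h>

struct rs_like {
	int has_check_reshape, degraded, recovering, reshaping;
	const char *error;
};

static int rs_check_reshape_like(struct rs_like *rs)
{
	if (!rs->has_check_reshape)
		rs->error = "Reshape not supported";
	else if (rs->degraded)
		rs->error = "Can't reshape degraded raid set";
	else if (rs->recovering)
		rs->error = "Convert request on recovering raid set prohibited";
	else if (rs->reshaping)
		rs->error = "raid set already reshaping!";
	else
		return 0;

	return -EPERM;
}

int main(void)
{
	struct rs_like rs = { 1, 0, 1, 0, NULL };

	if (rs_check_reshape_like(&rs))
		printf("%s\n", rs.error);	/* the recovery check fires first */
	return 0;
}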
@@ -1936,6 +2051,10 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	 *    and the new device needs to be rebuilt - in which
 	 *    case the In_sync bit will /not/ be set and
 	 *    recovery_cp must be MaxSector.
+	 * 3) New device(s) are being added to an old raid set
+	 *    during takeover to a higher raid level to provide
+	 *    capacity for redundancy, or during reshape to add
+	 *    capacity to grow the raid set.
 	 */
 	d = 0;
 	rdev_for_each(r, mddev) {
@@ -1961,9 +2080,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	if (new_devs == rs->raid_disks) {
 		DMINFO("Superblocks created for new raid set");
 		set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
-		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		mddev->recovery_cp = 0;
-	} else if (new_devs && new_devs != rs->raid_disks && !rebuilds) {
+	} else if (new_devs != rebuilds &&
+		   new_devs != rs->delta_disks) {
 		DMERR("New device injected into existing raid set without "
 		      "'delta_disks' or 'rebuild' parameter specified");
 		return -EINVAL;
@@ -1978,12 +2097,13 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 			DMERR("new device%s provided without 'rebuild'",
 			      new_devs > 1 ? "s" : "");
 			return -EINVAL;
-		} else if (mddev->recovery_cp != MaxSector) {
+		} else if (rs_is_recovering(rs)) {
 			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
 			      (unsigned long long) mddev->recovery_cp);
 			return -EINVAL;
-		} else if (mddev->reshape_position != MaxSector) {
-			DMERR("'rebuild' specified while raid set is being reshaped");
+		} else if (rs_is_reshaping(rs)) {
+			DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
+			      (unsigned long long) mddev->reshape_position);
 			return -EINVAL;
 		}
 	}
@@ -2082,7 +2202,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 	 * If no reshape in progress -> we're recovering single
 	 * disk(s) and have to set the device(s) to out-of-sync
 	 */
-	else if (rs->md.reshape_position == MaxSector)
+	else if (!rs_is_reshaping(rs))
 		clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */
 }
 
@@ -2181,15 +2301,13 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
 	 */
-	if (super_validate(rs, freshest)) {
-		rs->ti->error = "Unable to assemble array: Invalid superblocks";
+	rs->ti->error = "Unable to assemble array: Invalid superblocks";
+	if (super_validate(rs, freshest))
 		return -EINVAL;
-	}
 
 	rdev_for_each(rdev, mddev)
 		if ((rdev != freshest) && super_validate(rs, rdev))
 			return -EINVAL;
-
 	return 0;
 }
 
@@ -2344,6 +2462,106 @@ static int rs_setup_takeover(struct raid_set *rs)
 }
 
 /*
+ * Reshape:
+ * - change raid layout
+ * - change chunk size
+ * - add disks
+ * - remove disks
+ */
+static int rs_setup_reshape(struct raid_set *rs)
+{
+	int r = 0;
+	unsigned int cur_raid_devs, d;
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+
+	mddev->delta_disks = rs->delta_disks;
+	cur_raid_devs = mddev->raid_disks;
+
+	/* Ignore impossible layout change whilst adding/removing disks */
+	if (mddev->delta_disks &&
+	    mddev->layout != mddev->new_layout) {
+		DMINFO("Ignoring invalid layout change with delta_disks=%d", rs->delta_disks);
+		mddev->new_layout = mddev->layout;
+	}
+
+	/*
+	 * Adjust array size:
+	 *
+	 * - in case of adding disks, array size has
+	 *   to grow after the disk adding reshape,
+	 *   which'll happen in the event handler;
+	 *   reshape will happen forward, so space has to
+	 *   be available at the beginning of each disk
+	 *
+	 * - in case of removing disks, array size
+	 *   has to shrink before starting the reshape,
+	 *   which'll happen here;
+	 *   reshape will happen backward, so space has to
+	 *   be available at the end of each disk
+	 *
+	 * - data_offset and new_data_offset are
+	 *   adjusted for the aforementioned out-of-place
+	 *   reshaping based on userspace passing in
+	 *   the "data_offset <sectors>" key/value
+	 *   pair via the constructor
+	 */
+
+	/* Add disk(s) */
+	if (rs->delta_disks > 0) {
+		/* Prepare disks for check in raid4/5/6/10 {check|start}_reshape */
+		for (d = cur_raid_devs; d < rs->raid_disks; d++) {
+			rdev = &rs->dev[d].rdev;
+			clear_bit(In_sync, &rdev->flags);
+
+			/*
+			 * save_raid_disk needs to be -1, or recovery_offset will be set to 0
+			 * by md, which'll store that erroneously in the superblock on reshape
+			 */
+			rdev->saved_raid_disk = -1;
+			rdev->raid_disk = d;
+
+			rdev->sectors = mddev->dev_sectors;
+			rdev->recovery_offset = MaxSector;
+		}
+
+		mddev->reshape_backwards = 0; /* adding disks -> forward reshape */
+
+	/* Remove disk(s) */
+	} else if (rs->delta_disks < 0) {
+		r = rs_set_dev_and_array_sectors(rs, true);
+		mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */
+
+	/* Change layout and/or chunk size */
+	} else {
+		/*
+		 * Reshape layout (e.g. raid5_ls -> raid5_n) and/or chunk size:
+		 *
+		 * keeping the number of disks and doing a layout change ->
+		 *
+		 * toggle reshape_backwards depending on data_offset:
+		 *
+		 * - free space upfront -> reshape forward
+		 *
+		 * - free space at the end -> reshape backward
+		 *
+		 * This utilizes free reshape space avoiding the need
+		 * for userspace to move (parts of) LV segments in
+		 * case of layout/chunksize change (for disk
+		 * adding/removing reshape space has to be at
+		 * the proper address (see above with delta_disks):
+		 *
+		 * add disk(s)   -> begin
+		 * remove disk(s)-> end
+		 */
+		mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1;
+	}
+
+	return r;
+}
+
+/*
  * Enable/disable discard support on RAID set depending on
  * RAID level and discard properties of underlying RAID members.
  */
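The direction logic in rs_setup_reshape() compresses to three cases: grow forward, shrink backward, and for pure layout/chunk changes follow where the free out-of-place reshape space sits (a non-zero data_offset means the space is up front). A standalone sketch of that decision; names mirror the patch, values are illustrative:

#include <stdbool.h>
#include <stdio.h>

static bool reshape_backwards(int delta_disks, unsigned long long data_offset)
{
	if (delta_disks > 0)
		return false;	/* adding disks: reshape forward */
	if (delta_disks < 0)
		return true;	/* removing disks: reshape backward */
	/* layout/chunksize change: free space up front -> forward */
	return data_offset == 0;
}

int main(void)
{
	printf("%d\n", reshape_backwards(2, 0));	/* 0: grow forward */
	printf("%d\n", reshape_backwards(-1, 0));	/* 1: shrink backward */
	printf("%d\n", reshape_backwards(0, 8192));	/* 0: space up front */
	return 0;
}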
@@ -2411,6 +2629,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	unsigned num_raid_params, num_raid_devs;
 	struct raid_set *rs = NULL;
 	const char *arg;
+	struct rs_layout rs_layout;
 	struct dm_arg_set as = { argc, argv }, as_nrd;
 	struct dm_arg _args[] = {
 		{ 0, as.argc, "Cannot understand number of raid parameters" },
@@ -2469,7 +2688,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 * requested to be able to compare to superblock
 	 * members for conversion decisions.
 	 */
-	rs_config_backup(rs);
+	rs_config_backup(rs, &rs_layout);
 
 	r = analyse_superblocks(ti, rs);
 	if (r)
@@ -2480,13 +2699,23 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->num_flush_bios = 1;
 
 	/* Restore any requested new layout for conversion decision */
-	rs_config_restore(rs);
+	rs_config_restore(rs, &rs_layout);
 
-	/*
-	 * If a takeover is needed, just set the level to
-	 * the new requested one and allow the raid set to run.
-	 */
-	if (rs_takeover_requested(rs)) {
+	if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) {
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		rs_set_new(rs);
+	} else if (rs_is_reshaping(rs))
+		; /* skip rs setup */
+	else if (rs_takeover_requested(rs)) {
+		if (rs_is_reshaping(rs)) {
+			ti->error = "Can't takeover a reshaping raid set";
+			return -EPERM;
+		}
+
+		/*
+		 * If a takeover is needed, just set the level to
+		 * the new requested one and allow the raid set to run.
+		 */
 		r = rs_check_takeover(rs);
 		if (r)
 			return r;
@@ -2495,11 +2724,55 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		if (r)
 			return r;
 
-		/* Tell preresume to update superblocks with new layout */
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_new(rs);
 	} else if (rs_reshape_requested(rs)) {
-		rs_set_cur(rs); /* Dummy to reject, fill in */
+		if (rs_is_reshaping(rs)) {
+			ti->error = "raid set already reshaping!";
+			return -EPERM;
+		}
+
+		if (rs_is_raid10(rs)) {
+			if (rs->raid_disks != rs->md.raid_disks &&
+			    __is_raid10_near(rs->md.layout) &&
+			    rs->raid10_copies &&
+			    rs->raid10_copies != __raid10_near_copies(rs->md.layout)) {
+				/*
+				 * raid disks have to be a multiple of data copies
+				 * to allow this conversion.
+				 *
+				 * This is actually not a reshape; it is a
+				 * rebuild of any additional mirrors per group.
+				 */
+				if (rs->raid_disks % rs->raid10_copies) {
+					ti->error = "Can't reshape raid10 mirror groups";
+					return -EINVAL;
+				}
+
+				/* Userspace reordered disks to add/remove mirrors -> adjust raid_disk indexes */
+				__reorder_raid_disk_indexes(rs);
+				rs->md.layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
+									   rs->raid10_copies);
+				rs->md.new_layout = rs->md.layout;
+
+			} else
+				set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
+
+		} else if (rs_is_raid456(rs))
+			set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
+
+		/*
+		 * HM FIXME: process raid1 via delta_disks as well?
+		 * Would cause allocations in raid1->check_reshape
+		 * though, thus more issues with potential failures
+		 */
+		else if (rs_is_raid1(rs))
+			rs->md.raid_disks = rs->raid_disks;
+
+		if (rs->md.raid_disks < rs->raid_disks)
+			set_bit(MD_ARRAY_FIRST_USE, &rs->md.flags);
+
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		rs_set_cur(rs);
 	} else
 		rs_set_cur(rs);
 
@@ -2517,25 +2790,46 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	mddev_lock_nointr(&rs->md);
 	r = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-	mddev_unlock(&rs->md);
 
 	if (r) {
-		ti->error = "Fail to run raid array";
+		ti->error = "Failed to run raid array";
+		mddev_unlock(&rs->md);
 		goto bad;
 	}
 
-	if (ti->len != rs->md.array_sectors) {
-		ti->error = "Array size does not match requested target length";
-		r = -EINVAL;
-		goto size_mismatch;
-	}
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
 	mddev_suspend(&rs->md);
+
+	/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
+	if (rs_is_raid456(rs)) {
+		r = rs_set_raid456_stripe_cache(rs);
+		if (r)
+			goto bad_stripe_cache;
+	}
+
+	/* Now do an early reshape check */
+	if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+		r = rs_check_reshape(rs);
+		if (r)
+			return r;
+
+		/* Restore new, ctr requested layout to perform check */
+		rs_config_restore(rs, &rs_layout);
+
+		r = rs->md.pers->check_reshape(&rs->md);
+		if (r) {
+			ti->error = "Reshape check failed";
+			goto bad_check_reshape;
+		}
+	}
+
+	mddev_unlock(&rs->md);
 	return 0;
 
-size_mismatch:
+bad_stripe_cache:
+bad_check_reshape:
 	md_stop(&rs->md);
 bad:
 	raid_set_free(rs);
@@ -2557,6 +2851,17 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 
+	/*
+	 * If we're reshaping to add disk(s), ti->len and
+	 * mddev->array_sectors will differ during the process
+	 * (ti->len > mddev->array_sectors), so we have to requeue
+	 * bios with addresses > mddev->array_sectors here, or
+	 * accesses beyond the EOD of the component data images
+	 * would occur, erroring the raid set.
+	 */
+	if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
+		return DM_MAPIO_REQUEUE;
+
 	mddev->pers->make_request(mddev, bio);
 
 	return DM_MAPIO_SUBMITTED;
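A userspace mirror of the guard raid_map() gains: during a disk-adding reshape, ti->len already reflects the grown size while mddev->array_sectors still holds the old one, so any bio ending beyond array_sectors must be requeued rather than mapped past the end of the component images. Sector numbers below are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct bio_like { unsigned long long sector, nr_sectors; };

static unsigned long long bio_end_sector(const struct bio_like *bio)
{
	return bio->sector + bio->nr_sectors;
}

static bool must_requeue(const struct bio_like *bio,
			 unsigned long long array_sectors)
{
	return bio_end_sector(bio) > array_sectors;
}

int main(void)
{
	struct bio_like bio = { .sector = 1000, .nr_sectors = 8 };

	printf("%d\n", must_requeue(&bio, 1004));	/* 1: would run past EOD */
	printf("%d\n", must_requeue(&bio, 2048));	/* 0: safe to map */
	return 0;
}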
@@ -2709,7 +3014,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	if (!rt)
 		return;
 
-	DMEMIT("%s %d ", rt ? rt->name : "unknown", mddev->raid_disks);
+	DMEMIT("%s %d ", rt->name, mddev->raid_disks);
 
 	/* Access most recent mddev properties for status output */
 	smp_rmb();
@@ -2718,7 +3023,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			mddev->resync_max_sectors : mddev->dev_sectors;
 		progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
 		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
-				    (unsigned int) atomic64_read(&mddev->resync_mismatches) : 0;
+				    atomic64_read(&mddev->resync_mismatches) : 0;
 		sync_action = decipher_sync_action(&rs->md);
 
 		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
@@ -2925,6 +3230,8 @@ static void raid_postsuspend(struct dm_target *ti)
 	struct raid_set *rs = ti->private;
 
 	mddev_suspend(&rs->md);
+	rs->md.ro = 1;
+	clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags);
 }
 
 static void attempt_restore_of_faulty_devices(struct raid_set *rs)
2930static void attempt_restore_of_faulty_devices(struct raid_set *rs) 3237static void attempt_restore_of_faulty_devices(struct raid_set *rs)
@@ -2999,8 +3306,64 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
2999 return r; 3306 return r;
3000} 3307}
3001 3308
3309/*
3310 * Reshape changes raid algorithm of @rs to new one within personality
3311 * (e.g. raid6_zr -> raid6_nc), changes stripe size, adds/removes
3312 * disks from a raid set thus growing/shrinking it or resizes the set
3313 *
3314 * Call mddev_lock_nointr() before!
3315 */
3316static int rs_start_reshape(struct raid_set *rs)
3317{
3318 int r;
3319 struct mddev *mddev = &rs->md;
3320 struct md_personality *pers = mddev->pers;
3321
3322 r = rs_setup_reshape(rs);
3323 if (r)
3324 return r;
3325
3326 /* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
3327 if (mddev->suspended)
3328 mddev_resume(mddev);
3329
3330 /*
3331 * Check any reshape constraints enforced by the personalility
3332 *
3333 * May as well already kick the reshape off so that * pers->start_reshape() becomes optional.
3334 */
3335 r = pers->check_reshape(mddev);
3336 if (r) {
3337 rs->ti->error = "pers->check_reshape() failed";
3338 return r;
3339 }
3340
3341 /*
3342 * Personality may not provide start reshape method in which
3343 * case check_reshape above has already covered everything
3344 */
3345 if (pers->start_reshape) {
3346 r = pers->start_reshape(mddev);
3347 if (r) {
3348 rs->ti->error = "pers->start_reshape() failed";
3349 return r;
3350 }
3351 }
3352
3353 /* Suspend because a resume will happen in raid_resume() */
3354 if (!mddev->suspended)
3355 mddev_suspend(mddev);
3356
3357 mddev->ro = 0;
3358 md_update_sb(mddev, 1);
3359 mddev->ro = 1;
3360
3361 return 0;
3362}
3363
3002static int raid_preresume(struct dm_target *ti) 3364static int raid_preresume(struct dm_target *ti)
3003{ 3365{
3366 int r;
3004 struct raid_set *rs = ti->private; 3367 struct raid_set *rs = ti->private;
3005 struct mddev *mddev = &rs->md; 3368 struct mddev *mddev = &rs->md;
3006 3369
@@ -3034,7 +3397,33 @@ static int raid_preresume(struct dm_target *ti)
 	configure_discard_support(rs);
 
 	/* Load the bitmap from disk unless raid0 */
-	return __load_dirty_region_bitmap(rs);
+	r = __load_dirty_region_bitmap(rs);
+	if (r)
+		return r;
+
+	/* Check for any resize/reshape on @rs and adjust/initiate */
+	/* Be prepared for mddev_resume() in raid_resume() */
+	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
+		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		mddev->resync_min = mddev->recovery_cp;
+	}
+
+	rs_set_capacity(rs);
+
+	/* Check for any reshape request and region size change unless new raid set */
+	if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+		/* Initiate a reshape. */
+		mddev_lock_nointr(mddev);
+		r = rs_start_reshape(rs);
+		mddev_unlock(mddev);
+		if (r)
+			DMWARN("Failed to check/start reshape, continuing without change");
+		r = 0;
+	}
+
+	return r;
 }
 
 static void raid_resume(struct dm_target *ti)