author	Heinz Mauelshagen <heinzm@redhat.com>	2015-04-29 08:03:04 -0400
committer	Mike Snitzer <snitzer@redhat.com>	2015-05-29 14:19:00 -0400
commit	0cf4503174c12025ac7ea61048cb7c1d4d1ed85c (patch)
tree	52b88a85b6bd98f5156ecd2c1e573e04fcb7c59a /drivers/md/dm-raid.c
parent	c76d53f43ec4f9b9f200f031d303f21bdf6927d0 (diff)
dm raid: add support for the MD RAID0 personality
Add dm-raid access to the MD RAID0 personality to enable single zone striping.

The following changes enable that access:
- add a type definition to the raid_types array
- make bitmap creation conditional in super_validate(), because bitmaps
  are not allowed in raid0
- set rdev->sectors to the data image size in super_validate() to allow
  the raid0 personality to calculate the MD array size properly
- use the mddev_(un)lock() functions instead of direct mutex_(un)lock()
  (wrapped in here because it's a trivial change)
- enhance raid_status() to always report full sync for raid0 so that
  userspace checks for 100% sync will succeed and allow for resize
  (and takeover/reshape once added in future patches)
- enhance raid_resume() to not load the bitmap in the raid0 case
- add a merge function to avoid data corruption (seen with readahead)
  that resulted from bio payloads that grew too large. This problem did
  not occur with the other raid levels because it either did not apply
  without striping (raid1) or was avoided via stripe caching.
- raise the target version to 1.7.0 because of the raid0 API change

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
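For context, a minimal sketch of how userspace could activate a raid0 set with the new type, following the table format documented in Documentation/device-mapper/dm-raid.txt. The device names, the 2097152-sector (1 GiB) length and the 128-sector (64 KiB) chunk size are purely illustrative, and "-" stands for "no metadata device":

    # hypothetical 2-leg raid0 set: 1 raid param (chunk size 128 sectors),
    # 2 raid devices given as <metadata_dev> <data_dev> pairs
    dmsetup create r0 --table \
        "0 2097152 raid raid0 1 128 2 - /dev/sdb1 - /dev/sdc1"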
Diffstat (limited to 'drivers/md/dm-raid.c')
-rw-r--r--	drivers/md/dm-raid.c	132
1 file changed, 84 insertions, 48 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index af49ddebaa62..2daa67793511 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -82,6 +82,7 @@ static struct raid_type {
 	const unsigned level;     /* RAID level. */
 	const unsigned algorithm; /* RAID algorithm. */
 } raid_types[] = {
+	{"raid0", "RAID0 (striping)", 0, 2, 0, 0 /* NONE */},
 	{"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
 	{"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
 	{"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
@@ -719,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		rs->md.layout = raid10_format_to_md_layout(raid10_format,
 							   raid10_copies);
 		rs->md.new_layout = rs->md.layout;
-	} else if ((rs->raid_type->level > 1) &&
+	} else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
 		   sector_div(sectors_per_dev,
 			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
@@ -1025,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	return 0;
 }
 
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
+	struct mddev *mddev = &rs->md;
 	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
 
 	/*
@@ -1036,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 	if (!mddev->events && super_init_validation(mddev, rdev))
 		return -EINVAL;
 
-	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
-	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+	/* Enable bitmap creation for RAID levels != 0 */
+	mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
 	if (!test_bit(FirstUse, &rdev->flags)) {
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
 		if (rdev->recovery_offset != MaxSector)
@@ -1081,6 +1085,8 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
+		rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
 		if (rs->ctr_flags & CTR_FLAG_SYNC)
 			continue;
 
@@ -1139,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	 * validation for the remaining devices.
 	 */
 	ti->error = "Unable to assemble array: Invalid superblocks";
-	if (super_validate(mddev, freshest))
+	if (super_validate(rs, freshest))
 		return -EINVAL;
 
 	rdev_for_each(rdev, mddev)
-		if ((rdev != freshest) && super_validate(mddev, rdev))
+		if ((rdev != freshest) && super_validate(rs, rdev))
 			return -EINVAL;
 
 	return 0;
@@ -1281,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 */
 	configure_discard_support(ti, rs);
 
-	mutex_lock(&rs->md.reconfig_mutex);
+	/* Has to be held on running the array */
+	mddev_lock_nointr(&rs->md);
 	ret = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-	mutex_unlock(&rs->md.reconfig_mutex);
+	mddev_unlock(&rs->md);
 
 	if (ret) {
 		ti->error = "Fail to run raid array";
@@ -1367,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_INFO:
 		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
 
-		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-			sync = rs->md.curr_resync_completed;
-		else
-			sync = rs->md.recovery_cp;
-
-		if (sync >= rs->md.resync_max_sectors) {
-			/*
-			 * Sync complete.
-			 */
+		if (rs->raid_type->level) {
+			if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+				sync = rs->md.curr_resync_completed;
+			else
+				sync = rs->md.recovery_cp;
+
+			if (sync >= rs->md.resync_max_sectors) {
+				/*
+				 * Sync complete.
+				 */
+				array_in_sync = 1;
+				sync = rs->md.resync_max_sectors;
+			} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+				/*
+				 * If "check" or "repair" is occurring, the array has
+				 * undergone and initial sync and the health characters
+				 * should not be 'a' anymore.
+				 */
+				array_in_sync = 1;
+			} else {
+				/*
+				 * The array may be doing an initial sync, or it may
+				 * be rebuilding individual components. If all the
+				 * devices are In_sync, then it is the array that is
+				 * being initialized.
+				 */
+				for (i = 0; i < rs->md.raid_disks; i++)
+					if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+						array_in_sync = 1;
+			}
+		} else {
+			/* RAID0 */
 			array_in_sync = 1;
 			sync = rs->md.resync_max_sectors;
-		} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-			/*
-			 * If "check" or "repair" is occurring, the array has
-			 * undergone and initial sync and the health characters
-			 * should not be 'a' anymore.
-			 */
-			array_in_sync = 1;
-		} else {
-			/*
-			 * The array may be doing an initial sync, or it may
-			 * be rebuilding individual components. If all the
-			 * devices are In_sync, then it is the array that is
-			 * being initialized.
-			 */
-			for (i = 0; i < rs->md.raid_disks; i++)
-				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-					array_in_sync = 1;
 		}
 
 		/*
@@ -1683,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
-	if (!rs->bitmap_loaded) {
-		bitmap_load(&rs->md);
-		rs->bitmap_loaded = 1;
-	} else {
-		/*
-		 * A secondary resume while the device is active.
-		 * Take this opportunity to check whether any failed
-		 * devices are reachable again.
-		 */
-		attempt_restore_of_faulty_devices(rs);
+	if (rs->raid_type->level) {
+		set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+		if (!rs->bitmap_loaded) {
+			bitmap_load(&rs->md);
+			rs->bitmap_loaded = 1;
+		} else {
+			/*
+			 * A secondary resume while the device is active.
+			 * Take this opportunity to check whether any failed
+			 * devices are reachable again.
+			 */
+			attempt_restore_of_faulty_devices(rs);
+		}
+
+		clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	}
 
-	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	mddev_resume(&rs->md);
 }
 
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		      struct bio_vec *biovec, int max_size)
+{
+	struct raid_set *rs = ti->private;
+	struct md_personality *pers = rs->md.pers;
+
+	if (pers && pers->mergeable_bvec)
+		return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+	/*
+	 * In case we can't request the personality because
+	 * the raid set is not running yet
+	 *
+	 * -> return safe minimum
+	 */
+	return rs->md.chunk_sectors;
+}
+
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1714,6 +1749,7 @@ static struct target_type raid_target = {
 	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
 	.resume = raid_resume,
+	.merge = raid_merge,
 };
 
 static int __init dm_raid_init(void)