aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHeinz Mauelshagen <heinzm@redhat.com>2017-03-22 12:44:38 -0400
committerMike Snitzer <snitzer@redhat.com>2017-03-27 12:08:07 -0400
commit6e53636fe81465d6810f4e0910e7238edf12a133 (patch)
tree38be83f8fc4476c3fb4debce0b8135ed07854c1c
parent4464e36e06470e3d68dc26a874f0dbdffa09a6e8 (diff)
dm raid: add raid4/5/6 journal write-back support via journal_mode option
Commit 63c32ed4afc ("dm raid: add raid4/5/6 journaling support") added journal support to close the raid4/5/6 "write hole" -- in terms of writethrough caching. Introduce a "journal_mode" feature and use the new r5c_journal_mode_set() API to add support for switching the journal device's cache mode between write-through (the current default) and write-back. NOTE: If the journal device is not layered on resilient storage and it fails, write-through mode will cause the "write hole" to reoccur. But if the journal fails while in write-back mode it will cause data loss for any dirty cache entries unless resilient storage is used for the journal. Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
-rw-r--r--Documentation/device-mapper/dm-raid.txt11
-rw-r--r--drivers/md/dm-raid.c104
2 files changed, 101 insertions, 14 deletions
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 95c4c8dd6dd1..7e06e65586d4 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters:
170 Takeover/reshape is not possible with a raid4/5/6 journal device; 170 Takeover/reshape is not possible with a raid4/5/6 journal device;
171 it has to be deconfigured before requesting these. 171 it has to be deconfigured before requesting these.
172 172
173 [journal_mode <mode>]
174 This option sets the caching mode on journaled raid4/5/6 raid sets
175 (see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
176 If 'writeback' is selected the journal device has to be resilient
177 and must not suffer from the 'write hole' problem itself (e.g. use
178 raid1 or raid10) to avoid a single point of failure.
179
173<#raid_devs>: The number of devices composing the array. 180<#raid_devs>: The number of devices composing the array.
174 Each device consists of two entries. The first is the device 181 Each device consists of two entries. The first is the device
175 containing the metadata (if any); the second is the one containing the 182 containing the metadata (if any); the second is the one containing the
@@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields:
254 <data_offset> The current data offset to the start of the user data on 261 <data_offset> The current data offset to the start of the user data on
255 each component device of a raid set (see the respective 262 each component device of a raid set (see the respective
256 raid parameter to support out-of-place reshaping). 263 raid parameter to support out-of-place reshaping).
257 <journal_char> 'A' - active raid4/5/6 journal device. 264 <journal_char> 'A' - active write-through journal device.
265 'a' - active write-back journal device.
258 'D' - dead journal device. 266 'D' - dead journal device.
259 '-' - no journal device. 267 '-' - no journal device.
260 268
@@ -334,3 +342,4 @@ Version History
3341.10.1 Fix data corruption on reshape request 3421.10.1 Fix data corruption on reshape request
3351.11.0 Fix table line argument order 3431.11.0 Fix table line argument order
336 (wrong raid10_copies/raid10_format sequence) 344 (wrong raid10_copies/raid10_format sequence)
3451.11.1 Add raid4/5/6 journal write-back support via journal_mode option
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e07185fca638..0f61bb659b73 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2010-2011 Neil Brown 2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -79,7 +79,10 @@ struct raid_dev {
79#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ 79#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
80 80
81/* New for v1.10.0 */ 81/* New for v1.10.0 */
82#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ 82#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
83
84/* New for v1.11.1 */
85#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
83 86
84/* 87/*
85 * Flags for rs->ctr_flags field. 88 * Flags for rs->ctr_flags field.
@@ -100,6 +103,7 @@ struct raid_dev {
100#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) 103#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
101#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) 104#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
102#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) 105#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
106#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
103 107
104#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) 108#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
105 109
@@ -175,7 +179,8 @@ struct raid_dev {
175 CTR_FLAG_REGION_SIZE | \ 179 CTR_FLAG_REGION_SIZE | \
176 CTR_FLAG_DELTA_DISKS | \ 180 CTR_FLAG_DELTA_DISKS | \
177 CTR_FLAG_DATA_OFFSET | \ 181 CTR_FLAG_DATA_OFFSET | \
178 CTR_FLAG_JOURNAL_DEV) 182 CTR_FLAG_JOURNAL_DEV | \
183 CTR_FLAG_JOURNAL_MODE)
179 184
180#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ 185#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
181 CTR_FLAG_REBUILD | \ 186 CTR_FLAG_REBUILD | \
@@ -186,7 +191,8 @@ struct raid_dev {
186 CTR_FLAG_REGION_SIZE | \ 191 CTR_FLAG_REGION_SIZE | \
187 CTR_FLAG_DELTA_DISKS | \ 192 CTR_FLAG_DELTA_DISKS | \
188 CTR_FLAG_DATA_OFFSET | \ 193 CTR_FLAG_DATA_OFFSET | \
189 CTR_FLAG_JOURNAL_DEV) 194 CTR_FLAG_JOURNAL_DEV | \
195 CTR_FLAG_JOURNAL_MODE)
190/* ...valid options definitions per raid level */ 196/* ...valid options definitions per raid level */
191 197
192/* 198/*
@@ -239,6 +245,7 @@ struct raid_set {
239 struct journal_dev { 245 struct journal_dev {
240 struct dm_dev *dev; 246 struct dm_dev *dev;
241 struct md_rdev rdev; 247 struct md_rdev rdev;
248 int mode;
242 } journal_dev; 249 } journal_dev;
243 250
244 struct raid_dev dev[0]; 251 struct raid_dev dev[0];
@@ -326,6 +333,7 @@ static struct arg_name_flag {
326 { CTR_FLAG_DELTA_DISKS, "delta_disks"}, 333 { CTR_FLAG_DELTA_DISKS, "delta_disks"},
327 { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, 334 { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
328 { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, 335 { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
336 { CTR_FLAG_JOURNAL_MODE, "journal_mode" },
329}; 337};
330 338
331/* Return argument name string for given @flag */ 339/* Return argument name string for given @flag */
@@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
344 return NULL; 352 return NULL;
345} 353}
346 354
355/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
356static struct {
357 const int mode;
358 const char *param;
359} _raid456_journal_mode[] = {
360 { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" },
361 { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" }
362};
363
364/* Return MD raid4/5/6 journal mode for dm @journal_mode one */
365static int dm_raid_journal_mode_to_md(const char *mode)
366{
367 int m = ARRAY_SIZE(_raid456_journal_mode);
368
369 while (m--)
370 if (!strcasecmp(mode, _raid456_journal_mode[m].param))
371 return _raid456_journal_mode[m].mode;
372
373 return -EINVAL;
374}
375
376/* Return dm-raid raid4/5/6 journal mode string for @mode */
377static const char *md_journal_mode_to_dm_raid(const int mode)
378{
379 int m = ARRAY_SIZE(_raid456_journal_mode);
380
381 while (m--)
382 if (mode == _raid456_journal_mode[m].mode)
383 return _raid456_journal_mode[m].param;
384
385 return "unknown";
386}
387
347/* 388/*
348 * Bool helpers to test for various raid levels of a raid set. 389 * Bool helpers to test for various raid levels of a raid set.
349 * Its level as reported by the superblock rather than 390 * Its level as reported by the superblock rather than
@@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
1183 continue; 1224 continue;
1184 } 1225 }
1185 1226
1186 /* "journal_dev dev" */ 1227 /* "journal_dev <dev>" */
1187 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { 1228 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
1188 int r; 1229 int r;
1189 struct md_rdev *jdev; 1230 struct md_rdev *jdev;
@@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
1211 rs->ti->error = "No space for raid4/5/6 journal"; 1252 rs->ti->error = "No space for raid4/5/6 journal";
1212 return -ENOSPC; 1253 return -ENOSPC;
1213 } 1254 }
1255 rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1214 set_bit(Journal, &jdev->flags); 1256 set_bit(Journal, &jdev->flags);
1215 continue; 1257 continue;
1216 } 1258 }
1217 1259
1260 /* "journal_mode <mode>" ("journal_dev" mandatory!) */
1261 if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
1262 int r;
1263
1264 if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
1265 rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
1266 return -EINVAL;
1267 }
1268 if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
1269 rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
1270 return -EINVAL;
1271 }
1272 r = dm_raid_journal_mode_to_md(arg);
1273 if (r < 0) {
1274 rs->ti->error = "Invalid 'journal_mode' argument";
1275 return r;
1276 }
1277 rs->journal_dev.mode = r;
1278 continue;
1279 }
1280
1218 /* 1281 /*
1219 * Parameters with number values from here on. 1282 * Parameters with number values from here on.
1220 */ 1283 */
@@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3076 rs->callbacks.congested_fn = raid_is_congested; 3139 rs->callbacks.congested_fn = raid_is_congested;
3077 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 3140 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
3078 3141
3142 /* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
3143 if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
3144 r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
3145 if (r) {
3146 ti->error = "Failed to set raid4/5/6 journal mode";
3147 mddev_unlock(&rs->md);
3148 goto bad_journal_mode_set;
3149 }
3150 }
3151
3079 mddev_suspend(&rs->md); 3152 mddev_suspend(&rs->md);
3080 3153
3081 /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ 3154 /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3109 mddev_unlock(&rs->md); 3182 mddev_unlock(&rs->md);
3110 return 0; 3183 return 0;
3111 3184
3185bad_journal_mode_set:
3112bad_stripe_cache: 3186bad_stripe_cache:
3113bad_check_reshape: 3187bad_check_reshape:
3114 md_stop(&rs->md); 3188 md_stop(&rs->md);
@@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev)
3180 * Status characters: 3254 * Status characters:
3181 * 3255 *
3182 * 'D' = Dead/Failed raid set component or raid4/5/6 journal device 3256 * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
3183 * 'a' = Alive but not in-sync 3257 * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
3184 * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device 3258 * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
3185 * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) 3259 * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
3186 */ 3260 */
3187static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) 3261static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
3188{ 3262{
3189 if (!rdev->bdev) 3263 if (!rdev->bdev)
3190 return "-"; 3264 return "-";
3191 else if (test_bit(Faulty, &rdev->flags)) 3265 else if (test_bit(Faulty, &rdev->flags))
3192 return "D"; 3266 return "D";
3193 else if (test_bit(Journal, &rdev->flags)) 3267 else if (test_bit(Journal, &rdev->flags))
3194 return "A"; 3268 return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
3195 else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) 3269 else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
3196 return "a"; 3270 return "a";
3197 else 3271 else
@@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3315 3389
3316 /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ 3390 /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
3317 for (i = 0; i < rs->raid_disks; i++) 3391 for (i = 0; i < rs->raid_disks; i++)
3318 DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); 3392 DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
3319 3393
3320 /* 3394 /*
3321 * In-sync/Reshape ratio: 3395 * In-sync/Reshape ratio:
@@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3366 * v1.10.0+: 3440 * v1.10.0+:
3367 */ 3441 */
3368 DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 3442 DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
3369 __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); 3443 __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
3370 break; 3444 break;
3371 3445
3372 case STATUSTYPE_TABLE: 3446 case STATUSTYPE_TABLE:
@@ -3381,7 +3455,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3381 write_mostly_params + 3455 write_mostly_params +
3382 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + 3456 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
3383 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + 3457 hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
3384 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); 3458 (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
3459 (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
3385 3460
3386 /* Emit table line */ 3461 /* Emit table line */
3387 /* This has to be in the documented order for userspace! */ 3462 /* This has to be in the documented order for userspace! */
@@ -3433,6 +3508,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
3433 if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) 3508 if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
3434 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), 3509 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
3435 __get_dev_name(rs->journal_dev.dev)); 3510 __get_dev_name(rs->journal_dev.dev));
3511 if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
3512 DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
3513 md_journal_mode_to_dm_raid(rs->journal_dev.mode));
3436 DMEMIT(" %d", rs->raid_disks); 3514 DMEMIT(" %d", rs->raid_disks);
3437 for (i = 0; i < rs->raid_disks; i++) 3515 for (i = 0; i < rs->raid_disks; i++)
3438 DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), 3516 DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3793,7 +3871,7 @@ static void raid_resume(struct dm_target *ti)
3793 3871
3794static struct target_type raid_target = { 3872static struct target_type raid_target = {
3795 .name = "raid", 3873 .name = "raid",
3796 .version = {1, 11, 0}, 3874 .version = {1, 11, 1},
3797 .module = THIS_MODULE, 3875 .module = THIS_MODULE,
3798 .ctr = raid_ctr, 3876 .ctr = raid_ctr,
3799 .dtr = raid_dtr, 3877 .dtr = raid_dtr,