aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorJonathan Brassow <jbrassow@redhat.com>2011-08-02 07:32:07 -0400
committerAlasdair G Kergon <agk@redhat.com>2011-08-02 07:32:07 -0400
commitb12d437b73d32203a41fde0d407e91812c866844 (patch)
tree3c7a33f4a45779da4a5edb71678ce7f8ae4169e7 /drivers/md
parent46bed2b5c16bb7c82e1088d7ae75fb958c8a8c4e (diff)
dm raid: support metadata devices
Add the ability to parse and use metadata devices to dm-raid. Although not strictly required, without the metadata devices, many features of RAID are unavailable. They are used to store a superblock and bitmap. The role, or position in the array, of each device must be recorded in its superblock. This is to help with fault handling, array reshaping, and sanity checks. RAID 4/5/6 devices must be loaded in a specific order: in this way, the 'array_position' field helps validate the correctness of the mapping when it is loaded. It can be used during reshaping to identify which devices are added/removed. Fault handling is impossible without this field. For example, when a device fails it is recorded in the superblock. If this is a RAID1 device and the offending device is removed from the array, there must be a way during subsequent array assembly to determine that the failed device was the one removed. This is done by correlating the 'array_position' field and the bit-field variable 'failed_devices'. Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig5
-rw-r--r--drivers/md/dm-raid.c419
2 files changed, 406 insertions, 18 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5ee..f75a66e7d312 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR
241 needed for live data migration tools such as 'pvmove'. 241 needed for live data migration tools such as 'pvmove'.
242 242
243config DM_RAID 243config DM_RAID
244 tristate "RAID 4/5/6 target (EXPERIMENTAL)" 244 tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
245 depends on BLK_DEV_DM && EXPERIMENTAL 245 depends on BLK_DEV_DM && EXPERIMENTAL
246 select MD_RAID1
246 select MD_RAID456 247 select MD_RAID456
247 select BLK_DEV_MD 248 select BLK_DEV_MD
248 ---help--- 249 ---help---
249 A dm target that supports RAID4, RAID5 and RAID6 mappings 250 A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
250 251
251 A RAID-5 set of N drives with a capacity of C MB per drive provides 252 A RAID-5 set of N drives with a capacity of C MB per drive provides
252 the capacity of C * (N - 1) MB, and protects against a failure 253 the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88143a0303d2..69873806c50c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -16,12 +16,10 @@
16#define DM_MSG_PREFIX "raid" 16#define DM_MSG_PREFIX "raid"
17 17
18/* 18/*
19 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then 19 * The following flags are used by dm-raid.c to set up the array state.
20 * make it so the flag doesn't set anything. 20 * They must be cleared before md_run is called.
21 */ 21 */
22#ifndef MD_SYNC_STATE_FORCED 22#define FirstUse 10 /* rdev flag */
23#define MD_SYNC_STATE_FORCED 0
24#endif
25 23
26struct raid_dev { 24struct raid_dev {
27 /* 25 /*
@@ -149,9 +147,16 @@ static void context_free(struct raid_set *rs)
149{ 147{
150 int i; 148 int i;
151 149
152 for (i = 0; i < rs->md.raid_disks; i++) 150 for (i = 0; i < rs->md.raid_disks; i++) {
151 if (rs->dev[i].meta_dev)
152 dm_put_device(rs->ti, rs->dev[i].meta_dev);
153 if (rs->dev[i].rdev.sb_page)
154 put_page(rs->dev[i].rdev.sb_page);
155 rs->dev[i].rdev.sb_page = NULL;
156 rs->dev[i].rdev.sb_loaded = 0;
153 if (rs->dev[i].data_dev) 157 if (rs->dev[i].data_dev)
154 dm_put_device(rs->ti, rs->dev[i].data_dev); 158 dm_put_device(rs->ti, rs->dev[i].data_dev);
159 }
155 160
156 kfree(rs); 161 kfree(rs);
157} 162}
@@ -161,7 +166,16 @@ static void context_free(struct raid_set *rs)
161 * <meta_dev>: meta device name or '-' if missing 166 * <meta_dev>: meta device name or '-' if missing
162 * <data_dev>: data device name or '-' if missing 167 * <data_dev>: data device name or '-' if missing
163 * 168 *
164 * This code parses those words. 169 * The following are permitted:
170 * - -
171 * - <data_dev>
172 * <meta_dev> <data_dev>
173 *
174 * The following is not allowed:
175 * <meta_dev> -
176 *
177 * This code parses those words. If there is a failure,
178 * the caller must use context_free to unwind the operations.
165 */ 179 */
166static int dev_parms(struct raid_set *rs, char **argv) 180static int dev_parms(struct raid_set *rs, char **argv)
167{ 181{
@@ -184,8 +198,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
184 rs->dev[i].rdev.mddev = &rs->md; 198 rs->dev[i].rdev.mddev = &rs->md;
185 199
186 if (strcmp(argv[0], "-")) { 200 if (strcmp(argv[0], "-")) {
187 rs->ti->error = "Metadata devices not supported"; 201 ret = dm_get_device(rs->ti, argv[0],
188 return -EINVAL; 202 dm_table_get_mode(rs->ti->table),
203 &rs->dev[i].meta_dev);
204 rs->ti->error = "RAID metadata device lookup failure";
205 if (ret)
206 return ret;
207
208 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
209 if (!rs->dev[i].rdev.sb_page)
210 return -ENOMEM;
189 } 211 }
190 212
191 if (!strcmp(argv[1], "-")) { 213 if (!strcmp(argv[1], "-")) {
@@ -195,6 +217,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
195 return -EINVAL; 217 return -EINVAL;
196 } 218 }
197 219
220 rs->ti->error = "No data device supplied with metadata device";
221 if (rs->dev[i].meta_dev)
222 return -EINVAL;
223
198 continue; 224 continue;
199 } 225 }
200 226
@@ -206,6 +232,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
206 return ret; 232 return ret;
207 } 233 }
208 234
235 if (rs->dev[i].meta_dev) {
236 metadata_available = 1;
237 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
238 }
209 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; 239 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
210 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); 240 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
211 if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 241 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -334,22 +364,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
334 num_raid_params--; 364 num_raid_params--;
335 365
336 /* 366 /*
337 * Second, parse the unordered optional arguments 367 * We set each individual device as In_sync with a completed
368 * 'recovery_offset'. If there has been a device failure or
369 * replacement then one of the following cases applies:
370 *
371 * 1) User specifies 'rebuild'.
372 * - Device is reset when param is read.
373 * 2) A new device is supplied.
374 * - No matching superblock found, resets device.
375 * 3) Device failure was transient and returns on reload.
376 * - Failure noticed, resets device for bitmap replay.
377 * 4) Device hadn't completed recovery after previous failure.
378 * - Superblock is read and overrides recovery_offset.
379 *
380 * What is found in the superblocks of the devices is always
381 * authoritative, unless 'rebuild' or '[no]sync' was specified.
338 */ 382 */
339 for (i = 0; i < rs->md.raid_disks; i++) 383 for (i = 0; i < rs->md.raid_disks; i++) {
340 set_bit(In_sync, &rs->dev[i].rdev.flags); 384 set_bit(In_sync, &rs->dev[i].rdev.flags);
385 rs->dev[i].rdev.recovery_offset = MaxSector;
386 }
341 387
388 /*
389 * Second, parse the unordered optional arguments
390 */
342 for (i = 0; i < num_raid_params; i++) { 391 for (i = 0; i < num_raid_params; i++) {
343 if (!strcasecmp(argv[i], "nosync")) { 392 if (!strcasecmp(argv[i], "nosync")) {
344 rs->md.recovery_cp = MaxSector; 393 rs->md.recovery_cp = MaxSector;
345 rs->print_flags |= DMPF_NOSYNC; 394 rs->print_flags |= DMPF_NOSYNC;
346 rs->md.flags |= MD_SYNC_STATE_FORCED;
347 continue; 395 continue;
348 } 396 }
349 if (!strcasecmp(argv[i], "sync")) { 397 if (!strcasecmp(argv[i], "sync")) {
350 rs->md.recovery_cp = 0; 398 rs->md.recovery_cp = 0;
351 rs->print_flags |= DMPF_SYNC; 399 rs->print_flags |= DMPF_SYNC;
352 rs->md.flags |= MD_SYNC_STATE_FORCED;
353 continue; 400 continue;
354 } 401 }
355 402
@@ -482,13 +529,344 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
482} 529}
483 530
484/* 531/*
532 * This structure is never routinely used by userspace, unlike md superblocks.
533 * Devices with this superblock should only ever be accessed via device-mapper.
534 */
535#define DM_RAID_MAGIC 0x64526D44
536struct dm_raid_superblock {
537 __le32 magic; /* "DmRd" */
538 __le32 features; /* Used to indicate possible future changes */
539
540 __le32 num_devices; /* Number of devices in this array. (Max 64) */
541 __le32 array_position; /* The position of this drive in the array */
542
543 __le64 events; /* Incremented by md when superblock updated */
544 __le64 failed_devices; /* Bit field of devices to indicate failures */
545
546 /*
547 * This offset tracks the progress of the repair or replacement of
548 * an individual drive.
549 */
550 __le64 disk_recovery_offset;
551
552 /*
553 * This offset tracks the progress of the initial array
554 * synchronisation/parity calculation.
555 */
556 __le64 array_resync_offset;
557
558 /*
559 * RAID characteristics
560 */
561 __le32 level;
562 __le32 layout;
563 __le32 stripe_sectors;
564
565 __u8 pad[452]; /* Round struct to 512 bytes. */
566 /* Always set to 0 when writing. */
567} __packed;
568
569static int read_disk_sb(mdk_rdev_t *rdev, int size)
570{
571 BUG_ON(!rdev->sb_page);
572
573 if (rdev->sb_loaded)
574 return 0;
575
576 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
577 DMERR("Failed to read device superblock");
578 return -EINVAL;
579 }
580
581 rdev->sb_loaded = 1;
582
583 return 0;
584}
585
586static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
587{
588 mdk_rdev_t *r, *t;
589 uint64_t failed_devices;
590 struct dm_raid_superblock *sb;
591
592 sb = page_address(rdev->sb_page);
593 failed_devices = le64_to_cpu(sb->failed_devices);
594
595 rdev_for_each(r, t, mddev)
596 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
597 failed_devices |= (1ULL << r->raid_disk);
598
599 memset(sb, 0, sizeof(*sb));
600
601 sb->magic = cpu_to_le32(DM_RAID_MAGIC);
602 sb->features = cpu_to_le32(0); /* No features yet */
603
604 sb->num_devices = cpu_to_le32(mddev->raid_disks);
605 sb->array_position = cpu_to_le32(rdev->raid_disk);
606
607 sb->events = cpu_to_le64(mddev->events);
608 sb->failed_devices = cpu_to_le64(failed_devices);
609
610 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
611 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
612
613 sb->level = cpu_to_le32(mddev->level);
614 sb->layout = cpu_to_le32(mddev->layout);
615 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
616}
617
618/*
619 * super_load
620 *
621 * This function creates a superblock if one is not found on the device
622 * and will decide which superblock to use if there's a choice.
623 *
624 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
625 */
626static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
627{
628 int ret;
629 struct dm_raid_superblock *sb;
630 struct dm_raid_superblock *refsb;
631 uint64_t events_sb, events_refsb;
632
633 rdev->sb_start = 0;
634 rdev->sb_size = sizeof(*sb);
635
636 ret = read_disk_sb(rdev, rdev->sb_size);
637 if (ret)
638 return ret;
639
640 sb = page_address(rdev->sb_page);
641 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
642 super_sync(rdev->mddev, rdev);
643
644 set_bit(FirstUse, &rdev->flags);
645
646 /* Force writing of superblocks to disk */
647 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
648
649 /* Any superblock is better than none, choose that if given */
650 return refdev ? 0 : 1;
651 }
652
653 if (!refdev)
654 return 1;
655
656 events_sb = le64_to_cpu(sb->events);
657
658 refsb = page_address(refdev->sb_page);
659 events_refsb = le64_to_cpu(refsb->events);
660
661 return (events_sb > events_refsb) ? 1 : 0;
662}
663
664static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
665{
666 int role;
667 struct raid_set *rs = container_of(mddev, struct raid_set, md);
668 uint64_t events_sb;
669 uint64_t failed_devices;
670 struct dm_raid_superblock *sb;
671 uint32_t new_devs = 0;
672 uint32_t rebuilds = 0;
673 mdk_rdev_t *r, *t;
674 struct dm_raid_superblock *sb2;
675
676 sb = page_address(rdev->sb_page);
677 events_sb = le64_to_cpu(sb->events);
678 failed_devices = le64_to_cpu(sb->failed_devices);
679
680 /*
681 * Initialise to 1 if this is a new superblock.
682 */
683 mddev->events = events_sb ? : 1;
684
685 /*
686 * Reshaping is not currently allowed
687 */
688 if ((le32_to_cpu(sb->level) != mddev->level) ||
689 (le32_to_cpu(sb->layout) != mddev->layout) ||
690 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
691 DMERR("Reshaping arrays not yet supported.");
692 return -EINVAL;
693 }
694
695 /* We can only change the number of devices in RAID1 right now */
696 if ((rs->raid_type->level != 1) &&
697 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
698 DMERR("Reshaping arrays not yet supported.");
699 return -EINVAL;
700 }
701
702 if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
703 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
704
705 /*
706 * During load, we set FirstUse if a new superblock was written.
707 * There are two reasons we might not have a superblock:
708 * 1) The array is brand new - in which case, all of the
709 * devices must have their In_sync bit set. Also,
710 * recovery_cp must be 0, unless forced.
711 * 2) This is a new device being added to an old array
712 * and the new device needs to be rebuilt - in which
713 * case the In_sync bit will /not/ be set and
714 * recovery_cp must be MaxSector.
715 */
716 rdev_for_each(r, t, mddev) {
717 if (!test_bit(In_sync, &r->flags)) {
718 if (!test_bit(FirstUse, &r->flags))
719 DMERR("Superblock area of "
720 "rebuild device %d should have been "
721 "cleared.", r->raid_disk);
722 set_bit(FirstUse, &r->flags);
723 rebuilds++;
724 } else if (test_bit(FirstUse, &r->flags))
725 new_devs++;
726 }
727
728 if (!rebuilds) {
729 if (new_devs == mddev->raid_disks) {
730 DMINFO("Superblocks created for new array");
731 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
732 } else if (new_devs) {
733 DMERR("New device injected "
734 "into existing array without 'rebuild' "
735 "parameter specified");
736 return -EINVAL;
737 }
738 } else if (new_devs) {
739 DMERR("'rebuild' devices cannot be "
740 "injected into an array with other first-time devices");
741 return -EINVAL;
742 } else if (mddev->recovery_cp != MaxSector) {
743 DMERR("'rebuild' specified while array is not in-sync");
744 return -EINVAL;
745 }
746
747 /*
748 * Now we set the Faulty bit for those devices that are
749 * recorded in the superblock as failed.
750 */
751 rdev_for_each(r, t, mddev) {
752 if (!r->sb_page)
753 continue;
754 sb2 = page_address(r->sb_page);
755 sb2->failed_devices = 0;
756
757 /*
758 * Check for any device re-ordering.
759 */
760 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
761 role = le32_to_cpu(sb2->array_position);
762 if (role != r->raid_disk) {
763 if (rs->raid_type->level != 1) {
764 rs->ti->error = "Cannot change device "
765 "positions in RAID array";
766 return -EINVAL;
767 }
768 DMINFO("RAID1 device #%d now at position #%d",
769 role, r->raid_disk);
770 }
771
772 /*
773 * Partial recovery is performed on
774 * returning failed devices.
775 */
776 if (failed_devices & (1 << role))
777 set_bit(Faulty, &r->flags);
778 }
779 }
780
781 return 0;
782}
783
784static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
785{
786 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
787
788 /*
789 * If mddev->events is not set, we know we have not yet initialized
790 * the array.
791 */
792 if (!mddev->events && super_init_validation(mddev, rdev))
793 return -EINVAL;
794
795 mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
796 rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
797 if (!test_bit(FirstUse, &rdev->flags)) {
798 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
799 if (rdev->recovery_offset != MaxSector)
800 clear_bit(In_sync, &rdev->flags);
801 }
802
803 /*
804 * If a device comes back, set it as not In_sync and no longer faulty.
805 */
806 if (test_bit(Faulty, &rdev->flags)) {
807 clear_bit(Faulty, &rdev->flags);
808 clear_bit(In_sync, &rdev->flags);
809 rdev->saved_raid_disk = rdev->raid_disk;
810 rdev->recovery_offset = 0;
811 }
812
813 clear_bit(FirstUse, &rdev->flags);
814
815 return 0;
816}
817
818/*
819 * Analyse superblocks and select the freshest.
820 */
821static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
822{
823 int ret;
824 mdk_rdev_t *rdev, *freshest, *tmp;
825 mddev_t *mddev = &rs->md;
826
827 freshest = NULL;
828 rdev_for_each(rdev, tmp, mddev) {
829 if (!rdev->meta_bdev)
830 continue;
831
832 ret = super_load(rdev, freshest);
833
834 switch (ret) {
835 case 1:
836 freshest = rdev;
837 break;
838 case 0:
839 break;
840 default:
841 ti->error = "Failed to load superblock";
842 return ret;
843 }
844 }
845
846 if (!freshest)
847 return 0;
848
849 /*
850 * Validation of the freshest device provides the source of
851 * validation for the remaining devices.
852 */
853 ti->error = "Unable to assemble array: Invalid superblocks";
854 if (super_validate(mddev, freshest))
855 return -EINVAL;
856
857 rdev_for_each(rdev, tmp, mddev)
858 if ((rdev != freshest) && super_validate(mddev, rdev))
859 return -EINVAL;
860
861 return 0;
862}
863
864/*
485 * Construct a RAID4/5/6 mapping: 865 * Construct a RAID4/5/6 mapping:
486 * Args: 866 * Args:
487 * <raid_type> <#raid_params> <raid_params> \ 867 * <raid_type> <#raid_params> <raid_params> \
488 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } 868 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
489 * 869 *
490 * ** metadata devices are not supported yet, use '-' instead **
491 *
492 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 870 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
493 * details on possible <raid_params>. 871 * details on possible <raid_params>.
494 */ 872 */
@@ -556,6 +934,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
556 if (ret) 934 if (ret)
557 goto bad; 935 goto bad;
558 936
937 rs->md.sync_super = super_sync;
938 ret = analyse_superblocks(ti, rs);
939 if (ret)
940 goto bad;
941
559 INIT_WORK(&rs->md.event_work, do_table_event); 942 INIT_WORK(&rs->md.event_work, do_table_event);
560 ti->private = rs; 943 ti->private = rs;
561 944
@@ -698,7 +1081,10 @@ static int raid_status(struct dm_target *ti, status_type_t type,
698 1081
699 DMEMIT(" %d", rs->md.raid_disks); 1082 DMEMIT(" %d", rs->md.raid_disks);
700 for (i = 0; i < rs->md.raid_disks; i++) { 1083 for (i = 0; i < rs->md.raid_disks; i++) {
701 DMEMIT(" -"); /* metadata device */ 1084 if (rs->dev[i].meta_dev)
1085 DMEMIT(" %s", rs->dev[i].meta_dev->name);
1086 else
1087 DMEMIT(" -");
702 1088
703 if (rs->dev[i].data_dev) 1089 if (rs->dev[i].data_dev)
704 DMEMIT(" %s", rs->dev[i].data_dev->name); 1090 DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -755,6 +1141,7 @@ static void raid_resume(struct dm_target *ti)
755{ 1141{
756 struct raid_set *rs = ti->private; 1142 struct raid_set *rs = ti->private;
757 1143
1144 bitmap_load(&rs->md);
758 mddev_resume(&rs->md); 1145 mddev_resume(&rs->md);
759} 1146}
760 1147