aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/dm-raid.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm-raid.c')
-rw-r--r--drivers/md/dm-raid.c419
1 files changed, 403 insertions, 16 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88143a0303d2..69873806c50c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -16,12 +16,10 @@
16#define DM_MSG_PREFIX "raid" 16#define DM_MSG_PREFIX "raid"
17 17
18/* 18/*
19 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then 19 * The following flags are used by dm-raid.c to set up the array state.
20 * make it so the flag doesn't set anything. 20 * They must be cleared before md_run is called.
21 */ 21 */
22#ifndef MD_SYNC_STATE_FORCED 22#define FirstUse 10 /* rdev flag */
23#define MD_SYNC_STATE_FORCED 0
24#endif
25 23
26struct raid_dev { 24struct raid_dev {
27 /* 25 /*
@@ -149,9 +147,16 @@ static void context_free(struct raid_set *rs)
149{ 147{
150 int i; 148 int i;
151 149
152 for (i = 0; i < rs->md.raid_disks; i++) 150 for (i = 0; i < rs->md.raid_disks; i++) {
151 if (rs->dev[i].meta_dev)
152 dm_put_device(rs->ti, rs->dev[i].meta_dev);
153 if (rs->dev[i].rdev.sb_page)
154 put_page(rs->dev[i].rdev.sb_page);
155 rs->dev[i].rdev.sb_page = NULL;
156 rs->dev[i].rdev.sb_loaded = 0;
153 if (rs->dev[i].data_dev) 157 if (rs->dev[i].data_dev)
154 dm_put_device(rs->ti, rs->dev[i].data_dev); 158 dm_put_device(rs->ti, rs->dev[i].data_dev);
159 }
155 160
156 kfree(rs); 161 kfree(rs);
157} 162}
@@ -161,7 +166,16 @@ static void context_free(struct raid_set *rs)
161 * <meta_dev>: meta device name or '-' if missing 166 * <meta_dev>: meta device name or '-' if missing
162 * <data_dev>: data device name or '-' if missing 167 * <data_dev>: data device name or '-' if missing
163 * 168 *
164 * This code parses those words. 169 * The following are permitted:
170 * - -
171 * - <data_dev>
172 * <meta_dev> <data_dev>
173 *
174 * The following is not allowed:
175 * <meta_dev> -
176 *
177 * This code parses those words. If there is a failure,
178 * the caller must use context_free to unwind the operations.
165 */ 179 */
166static int dev_parms(struct raid_set *rs, char **argv) 180static int dev_parms(struct raid_set *rs, char **argv)
167{ 181{
@@ -184,8 +198,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
184 rs->dev[i].rdev.mddev = &rs->md; 198 rs->dev[i].rdev.mddev = &rs->md;
185 199
186 if (strcmp(argv[0], "-")) { 200 if (strcmp(argv[0], "-")) {
187 rs->ti->error = "Metadata devices not supported"; 201 ret = dm_get_device(rs->ti, argv[0],
188 return -EINVAL; 202 dm_table_get_mode(rs->ti->table),
203 &rs->dev[i].meta_dev);
204 rs->ti->error = "RAID metadata device lookup failure";
205 if (ret)
206 return ret;
207
208 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
209 if (!rs->dev[i].rdev.sb_page)
210 return -ENOMEM;
189 } 211 }
190 212
191 if (!strcmp(argv[1], "-")) { 213 if (!strcmp(argv[1], "-")) {
@@ -195,6 +217,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
195 return -EINVAL; 217 return -EINVAL;
196 } 218 }
197 219
220 rs->ti->error = "No data device supplied with metadata device";
221 if (rs->dev[i].meta_dev)
222 return -EINVAL;
223
198 continue; 224 continue;
199 } 225 }
200 226
@@ -206,6 +232,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
206 return ret; 232 return ret;
207 } 233 }
208 234
235 if (rs->dev[i].meta_dev) {
236 metadata_available = 1;
237 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
238 }
209 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; 239 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
210 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); 240 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
211 if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 241 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -334,22 +364,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
334 num_raid_params--; 364 num_raid_params--;
335 365
336 /* 366 /*
337 * Second, parse the unordered optional arguments 367 * We set each individual device as In_sync with a completed
368 * 'recovery_offset'. If there has been a device failure or
369 * replacement then one of the following cases applies:
370 *
371 * 1) User specifies 'rebuild'.
372 * - Device is reset when param is read.
373 * 2) A new device is supplied.
374 * - No matching superblock found, resets device.
375 * 3) Device failure was transient and returns on reload.
376 * - Failure noticed, resets device for bitmap replay.
377 * 4) Device hadn't completed recovery after previous failure.
378 * - Superblock is read and overrides recovery_offset.
379 *
380 * What is found in the superblocks of the devices is always
381 * authoritative, unless 'rebuild' or '[no]sync' was specified.
338 */ 382 */
339 for (i = 0; i < rs->md.raid_disks; i++) 383 for (i = 0; i < rs->md.raid_disks; i++) {
340 set_bit(In_sync, &rs->dev[i].rdev.flags); 384 set_bit(In_sync, &rs->dev[i].rdev.flags);
385 rs->dev[i].rdev.recovery_offset = MaxSector;
386 }
341 387
388 /*
389 * Second, parse the unordered optional arguments
390 */
342 for (i = 0; i < num_raid_params; i++) { 391 for (i = 0; i < num_raid_params; i++) {
343 if (!strcasecmp(argv[i], "nosync")) { 392 if (!strcasecmp(argv[i], "nosync")) {
344 rs->md.recovery_cp = MaxSector; 393 rs->md.recovery_cp = MaxSector;
345 rs->print_flags |= DMPF_NOSYNC; 394 rs->print_flags |= DMPF_NOSYNC;
346 rs->md.flags |= MD_SYNC_STATE_FORCED;
347 continue; 395 continue;
348 } 396 }
349 if (!strcasecmp(argv[i], "sync")) { 397 if (!strcasecmp(argv[i], "sync")) {
350 rs->md.recovery_cp = 0; 398 rs->md.recovery_cp = 0;
351 rs->print_flags |= DMPF_SYNC; 399 rs->print_flags |= DMPF_SYNC;
352 rs->md.flags |= MD_SYNC_STATE_FORCED;
353 continue; 400 continue;
354 } 401 }
355 402
@@ -482,13 +529,344 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
482} 529}
483 530
484/* 531/*
532 * This structure is never routinely used by userspace, unlike md superblocks.
533 * Devices with this superblock should only ever be accessed via device-mapper.
534 */
535#define DM_RAID_MAGIC 0x64526D44
536struct dm_raid_superblock {
537 __le32 magic; /* "DmRd" */
538 __le32 features; /* Used to indicate possible future changes */
539
540 __le32 num_devices; /* Number of devices in this array. (Max 64) */
541 __le32 array_position; /* The position of this drive in the array */
542
543 __le64 events; /* Incremented by md when superblock updated */
544 __le64 failed_devices; /* Bit field of devices to indicate failures */
545
546 /*
547 * This offset tracks the progress of the repair or replacement of
548 * an individual drive.
549 */
550 __le64 disk_recovery_offset;
551
552 /*
553 * This offset tracks the progress of the initial array
554 * synchronisation/parity calculation.
555 */
556 __le64 array_resync_offset;
557
558 /*
559 * RAID characteristics
560 */
561 __le32 level;
562 __le32 layout;
563 __le32 stripe_sectors;
564
565 __u8 pad[452]; /* Round struct to 512 bytes. */
566 /* Always set to 0 when writing. */
567} __packed;
568
569static int read_disk_sb(mdk_rdev_t *rdev, int size)
570{
571 BUG_ON(!rdev->sb_page);
572
573 if (rdev->sb_loaded)
574 return 0;
575
576 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
577 DMERR("Failed to read device superblock");
578 return -EINVAL;
579 }
580
581 rdev->sb_loaded = 1;
582
583 return 0;
584}
585
586static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
587{
588 mdk_rdev_t *r, *t;
589 uint64_t failed_devices;
590 struct dm_raid_superblock *sb;
591
592 sb = page_address(rdev->sb_page);
593 failed_devices = le64_to_cpu(sb->failed_devices);
594
595 rdev_for_each(r, t, mddev)
596 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
597 failed_devices |= (1ULL << r->raid_disk);
598
599 memset(sb, 0, sizeof(*sb));
600
601 sb->magic = cpu_to_le32(DM_RAID_MAGIC);
602 sb->features = cpu_to_le32(0); /* No features yet */
603
604 sb->num_devices = cpu_to_le32(mddev->raid_disks);
605 sb->array_position = cpu_to_le32(rdev->raid_disk);
606
607 sb->events = cpu_to_le64(mddev->events);
608 sb->failed_devices = cpu_to_le64(failed_devices);
609
610 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
611 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
612
613 sb->level = cpu_to_le32(mddev->level);
614 sb->layout = cpu_to_le32(mddev->layout);
615 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
616}
617
618/*
619 * super_load
620 *
621 * This function creates a superblock if one is not found on the device
622 * and will decide which superblock to use if there's a choice.
623 *
624 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
625 */
626static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
627{
628 int ret;
629 struct dm_raid_superblock *sb;
630 struct dm_raid_superblock *refsb;
631 uint64_t events_sb, events_refsb;
632
633 rdev->sb_start = 0;
634 rdev->sb_size = sizeof(*sb);
635
636 ret = read_disk_sb(rdev, rdev->sb_size);
637 if (ret)
638 return ret;
639
640 sb = page_address(rdev->sb_page);
641 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
642 super_sync(rdev->mddev, rdev);
643
644 set_bit(FirstUse, &rdev->flags);
645
646 /* Force writing of superblocks to disk */
647 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
648
649 /* Any superblock is better than none, choose that if given */
650 return refdev ? 0 : 1;
651 }
652
653 if (!refdev)
654 return 1;
655
656 events_sb = le64_to_cpu(sb->events);
657
658 refsb = page_address(refdev->sb_page);
659 events_refsb = le64_to_cpu(refsb->events);
660
661 return (events_sb > events_refsb) ? 1 : 0;
662}
663
664static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
665{
666 int role;
667 struct raid_set *rs = container_of(mddev, struct raid_set, md);
668 uint64_t events_sb;
669 uint64_t failed_devices;
670 struct dm_raid_superblock *sb;
671 uint32_t new_devs = 0;
672 uint32_t rebuilds = 0;
673 mdk_rdev_t *r, *t;
674 struct dm_raid_superblock *sb2;
675
676 sb = page_address(rdev->sb_page);
677 events_sb = le64_to_cpu(sb->events);
678 failed_devices = le64_to_cpu(sb->failed_devices);
679
680 /*
681 * Initialise to 1 if this is a new superblock.
682 */
683 mddev->events = events_sb ? : 1;
684
685 /*
686 * Reshaping is not currently allowed
687 */
688 if ((le32_to_cpu(sb->level) != mddev->level) ||
689 (le32_to_cpu(sb->layout) != mddev->layout) ||
690 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
691 DMERR("Reshaping arrays not yet supported.");
692 return -EINVAL;
693 }
694
695 /* We can only change the number of devices in RAID1 right now */
696 if ((rs->raid_type->level != 1) &&
697 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
698 DMERR("Reshaping arrays not yet supported.");
699 return -EINVAL;
700 }
701
702 if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
703 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
704
705 /*
706 * During load, we set FirstUse if a new superblock was written.
707 * There are two reasons we might not have a superblock:
708 * 1) The array is brand new - in which case, all of the
709 * devices must have their In_sync bit set. Also,
710 * recovery_cp must be 0, unless forced.
711 * 2) This is a new device being added to an old array
712 * and the new device needs to be rebuilt - in which
713 * case the In_sync bit will /not/ be set and
714 * recovery_cp must be MaxSector.
715 */
716 rdev_for_each(r, t, mddev) {
717 if (!test_bit(In_sync, &r->flags)) {
718 if (!test_bit(FirstUse, &r->flags))
719 DMERR("Superblock area of "
720 "rebuild device %d should have been "
721 "cleared.", r->raid_disk);
722 set_bit(FirstUse, &r->flags);
723 rebuilds++;
724 } else if (test_bit(FirstUse, &r->flags))
725 new_devs++;
726 }
727
728 if (!rebuilds) {
729 if (new_devs == mddev->raid_disks) {
730 DMINFO("Superblocks created for new array");
731 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
732 } else if (new_devs) {
733 DMERR("New device injected "
734 "into existing array without 'rebuild' "
735 "parameter specified");
736 return -EINVAL;
737 }
738 } else if (new_devs) {
739 DMERR("'rebuild' devices cannot be "
740 "injected into an array with other first-time devices");
741 return -EINVAL;
742 } else if (mddev->recovery_cp != MaxSector) {
743 DMERR("'rebuild' specified while array is not in-sync");
744 return -EINVAL;
745 }
746
747 /*
748 * Now we set the Faulty bit for those devices that are
749 * recorded in the superblock as failed.
750 */
751 rdev_for_each(r, t, mddev) {
752 if (!r->sb_page)
753 continue;
754 sb2 = page_address(r->sb_page);
755 sb2->failed_devices = 0;
756
757 /*
758 * Check for any device re-ordering.
759 */
760 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
761 role = le32_to_cpu(sb2->array_position);
762 if (role != r->raid_disk) {
763 if (rs->raid_type->level != 1) {
764 rs->ti->error = "Cannot change device "
765 "positions in RAID array";
766 return -EINVAL;
767 }
768 DMINFO("RAID1 device #%d now at position #%d",
769 role, r->raid_disk);
770 }
771
772 /*
773 * Partial recovery is performed on
774 * returning failed devices.
775 */
776 if (failed_devices & (1 << role))
777 set_bit(Faulty, &r->flags);
778 }
779 }
780
781 return 0;
782}
783
784static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
785{
786 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
787
788 /*
789 * If mddev->events is not set, we know we have not yet initialized
790 * the array.
791 */
792 if (!mddev->events && super_init_validation(mddev, rdev))
793 return -EINVAL;
794
795 mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
796 rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
797 if (!test_bit(FirstUse, &rdev->flags)) {
798 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
799 if (rdev->recovery_offset != MaxSector)
800 clear_bit(In_sync, &rdev->flags);
801 }
802
803 /*
804 * If a device comes back, set it as not In_sync and no longer faulty.
805 */
806 if (test_bit(Faulty, &rdev->flags)) {
807 clear_bit(Faulty, &rdev->flags);
808 clear_bit(In_sync, &rdev->flags);
809 rdev->saved_raid_disk = rdev->raid_disk;
810 rdev->recovery_offset = 0;
811 }
812
813 clear_bit(FirstUse, &rdev->flags);
814
815 return 0;
816}
817
818/*
819 * Analyse superblocks and select the freshest.
820 */
821static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
822{
823 int ret;
824 mdk_rdev_t *rdev, *freshest, *tmp;
825 mddev_t *mddev = &rs->md;
826
827 freshest = NULL;
828 rdev_for_each(rdev, tmp, mddev) {
829 if (!rdev->meta_bdev)
830 continue;
831
832 ret = super_load(rdev, freshest);
833
834 switch (ret) {
835 case 1:
836 freshest = rdev;
837 break;
838 case 0:
839 break;
840 default:
841 ti->error = "Failed to load superblock";
842 return ret;
843 }
844 }
845
846 if (!freshest)
847 return 0;
848
849 /*
850 * Validation of the freshest device provides the source of
851 * validation for the remaining devices.
852 */
853 ti->error = "Unable to assemble array: Invalid superblocks";
854 if (super_validate(mddev, freshest))
855 return -EINVAL;
856
857 rdev_for_each(rdev, tmp, mddev)
858 if ((rdev != freshest) && super_validate(mddev, rdev))
859 return -EINVAL;
860
861 return 0;
862}
863
864/*
485 * Construct a RAID4/5/6 mapping: 865 * Construct a RAID4/5/6 mapping:
486 * Args: 866 * Args:
487 * <raid_type> <#raid_params> <raid_params> \ 867 * <raid_type> <#raid_params> <raid_params> \
488 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } 868 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
489 * 869 *
490 * ** metadata devices are not supported yet, use '-' instead **
491 *
492 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 870 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
493 * details on possible <raid_params>. 871 * details on possible <raid_params>.
494 */ 872 */
@@ -556,6 +934,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
556 if (ret) 934 if (ret)
557 goto bad; 935 goto bad;
558 936
937 rs->md.sync_super = super_sync;
938 ret = analyse_superblocks(ti, rs);
939 if (ret)
940 goto bad;
941
559 INIT_WORK(&rs->md.event_work, do_table_event); 942 INIT_WORK(&rs->md.event_work, do_table_event);
560 ti->private = rs; 943 ti->private = rs;
561 944
@@ -698,7 +1081,10 @@ static int raid_status(struct dm_target *ti, status_type_t type,
698 1081
699 DMEMIT(" %d", rs->md.raid_disks); 1082 DMEMIT(" %d", rs->md.raid_disks);
700 for (i = 0; i < rs->md.raid_disks; i++) { 1083 for (i = 0; i < rs->md.raid_disks; i++) {
701 DMEMIT(" -"); /* metadata device */ 1084 if (rs->dev[i].meta_dev)
1085 DMEMIT(" %s", rs->dev[i].meta_dev->name);
1086 else
1087 DMEMIT(" -");
702 1088
703 if (rs->dev[i].data_dev) 1089 if (rs->dev[i].data_dev)
704 DMEMIT(" %s", rs->dev[i].data_dev->name); 1090 DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -755,6 +1141,7 @@ static void raid_resume(struct dm_target *ti)
755{ 1141{
756 struct raid_set *rs = ti->private; 1142 struct raid_set *rs = ti->private;
757 1143
1144 bitmap_load(&rs->md);
758 mddev_resume(&rs->md); 1145 mddev_resume(&rs->md);
759} 1146}
760 1147