summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5-ppl.c
diff options
context:
space:
mode:
authorArtur Paszkiewicz <artur.paszkiewicz@intel.com>2017-03-09 04:00:01 -0500
committerShaohua Li <shli@fb.com>2017-03-16 19:55:55 -0400
commit4536bf9ba2d03404655586b07f8830b6f2106242 (patch)
tree2b83774c1661296f1d5913c363342f5beaaf5eb9 /drivers/md/raid5-ppl.c
parent664aed04446c7f653d8acbe2cdf7989f28238524 (diff)
raid5-ppl: load and recover the log
Load the log from each disk when starting the array and recover if the array is dirty. The initial empty PPL is written by mdadm. When loading the log we verify the header checksum and signature. For external metadata arrays the signature is verified in userspace, so here we read it from the header, verifying only if it matches on all disks, and use it later when writing PPL. In addition to the header checksum, each header entry also contains a checksum of its partial parity data. If the header is valid, recovery is performed for each entry until an invalid entry is found. If the array is not degraded and recovery using PPL fully succeeds, there is no need to resync the array because data and parity will be consistent, so in this case resync will be disabled. Due to compatibility with IMSM implementations on other systems, we can't assume that the recovery data block size is always 4K. Writes generated by MD raid5 don't have this issue, but when recovering PPL written in other environments it is possible to have entries with 512-byte sector granularity. The recovery code takes this into account and also the logical sector size of the underlying drives. Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com> Signed-off-by: Shaohua Li <shli@fb.com>
Diffstat (limited to 'drivers/md/raid5-ppl.c')
-rw-r--r--drivers/md/raid5-ppl.c489
1 files changed, 489 insertions, 0 deletions
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index db5b72b11594..d336c024eef9 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -103,6 +103,10 @@ struct ppl_conf {
103 mempool_t *io_pool; 103 mempool_t *io_pool;
104 struct bio_set *bs; 104 struct bio_set *bs;
105 mempool_t *meta_pool; 105 mempool_t *meta_pool;
106
107 /* used only for recovery */
108 int recovered_entries;
109 int mismatch_count;
106}; 110};
107 111
108struct ppl_log { 112struct ppl_log {
@@ -514,6 +518,474 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
514 ppl_io_unit_finished(io); 518 ppl_io_unit_finished(io);
515} 519}
516 520
/*
 * XOR @size bytes of page2 into page1 (page1 ^= page2) using the async_tx
 * API, so an XOR offload engine can be used when one is available.
 * Blocks until the operation has completed.
 */
static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	/*
	 * ASYNC_TX_XOR_DROP_DST: do not include the destination page's old
	 * contents as an implicit source; only the two explicit sources are
	 * xored, with the result landing in page1.
	 */
	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	/* the xor may run asynchronously - wait for completion */
	async_tx_quiesce(&tx);
}
533
/*
 * PPL recovery strategy: xor partial parity and data from all modified data
 * disks within a stripe and write the result as the new stripe parity. If all
 * stripe data disks are modified (full stripe write), no partial parity is
 * available, so just xor the data disks.
 *
 * Recovery of a PPL entry shall occur only if all modified data disks are
 * available and read from all of them succeeds.
 *
 * A PPL entry applies to a stripe, partial parity size for an entry is at most
 * the size of the chunk. Examples of possible cases for a single entry:
 *
 * case 0: single data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size
 *
 * case 1: more than one data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size / modified_data_disks
 *
 * case 2: write to all data disks (also full stripe write):
 *   data0    data1    data2                parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ |           | (no change)        |
 * | -data- | -data- | -data- | --------> | xor all data       |
 * | ------ | ------ | ------ | --------> | (no change)        |
 * | ------ | ------ | ------ |           | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = 0
 *
 * The following cases are possible only in other implementations. The recovery
 * code can handle them, but they are not generated at runtime because they can
 * be reduced to cases 0, 1 and 2:
 *
 * case 3:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
 * | -data- | -data- | -data- | | -- | -> | xor all data       |
 * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 *
 * case 4:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 */
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	/*
	 * Recovery block granularity; may be 512 bytes for PPL written by
	 * other (IMSM-compatible) implementations - see the commit log.
	 */
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	/* page1 accumulates the xor result, page2 holds each block read */
	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	/* first array sector covered by this entry, in 512-byte sectors */
	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	/*
	 * Derive the number of modified data disks and the strip length from
	 * pp_size/data_size, distinguishing cases 0-2 above from the
	 * full-chunk cases 3-4.
	 */
	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			/* cases 0 and 1: pp covers each modified strip once */
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			/* case 2: full stripe write, no partial parity */
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		/* cases 3 and 4: entry spans whole chunks */
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end is 4k aligned, use a 4k block */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;	/* debug-output indentation only */

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		/* start the xor accumulator from zero for this block */
		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev) {
				/*
				 * Recovery requires all modified data disks;
				 * skip the parity update for this block.
				 */
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					  REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			/* fold this data block into the parity accumulator */
			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			/*
			 * ppl_sector is an absolute device sector; subtract
			 * data_offset here - presumably sync_page_io()
			 * addresses relative to the rdev data area (TODO
			 * confirm against sync_page_io()).
			 */
			if (!sync_page_io(log->rdev,
					  ppl_sector - log->rdev->data_offset + i,
					  block_size, page2, REQ_OP_READ, 0,
					  false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			/* xor in the partial parity for this block */
			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
						     0, &disk, &sh);
		/* the entry records which disk held parity for this stripe */
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		/* this log's PPL must live on the stripe's parity disk */
		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				  page1, REQ_OP_WRITE, 0, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);
	return ret;
}
773
/*
 * Replay all valid entries of one PPL header: verify each entry's partial
 * parity checksum and, on a match, recover the stripe via
 * ppl_recover_entry().  A checksum mismatch stops recovery of that entry
 * only (counted in mismatch_count); entry recovery errors abort.  Finishes
 * with a disk cache flush so recovered parity is durable.
 *
 * Returns 0 on success or a negative errno.
 */
static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	/* partial parity data starts right after the header */
	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* read partial parity for this entry and calculate its checksum */
		while (pp_size) {
			/* at most one page per read */
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					  s, page, REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		/* advance past this entry's partial parity data */
		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery if necessary */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}
847
/*
 * Write an empty (zero entries) PPL header to this log's disk, carrying the
 * configured signature and a fresh checksum.  Called when starting the array
 * so stale log contents are not replayed again later.
 *
 * Returns 0 on success, -ENOMEM or -EIO on failure.
 */
static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	/* zeroed page: entries_count and entry data all read as 0 */
	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);
	/* reserved area is defined to be all-ones */
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	/* checksum over the whole header page, with checksum field still 0 */
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	/* REQ_FUA: header must be durable before the array goes live */
	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
			  false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}
877
/*
 * Load and validate the PPL header from one member disk, recover from it if
 * the array is dirty, and finally overwrite it with an empty header when the
 * array is starting.  Invalid checksum or signature is not fatal; it only
 * bumps mismatch_count so the caller can decide what to do.
 *
 * Returns 0 on success or a negative errno.
 */
static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page;
	struct ppl_header *pplhdr;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);

	/* read PPL header */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		md_error(mddev, rdev);
		ret = -EIO;
		goto out;
	}
	pplhdr = page_address(page);

	/* check header validity */
	crc_stored = le32_to_cpu(pplhdr->checksum);
	/* the checksum was computed with this field zeroed - recompute likewise */
	pplhdr->checksum = 0;
	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

	if (crc_stored != crc) {
		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
			 __func__, crc_stored, crc);
		ppl_conf->mismatch_count++;
		goto out;
	}

	signature = le32_to_cpu(pplhdr->signature);

	if (mddev->external) {
		/*
		 * For external metadata the header signature is set and
		 * validated in userspace.
		 */
		ppl_conf->signature = signature;
	} else if (ppl_conf->signature != signature) {
		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
			 __func__, signature, ppl_conf->signature);
		ppl_conf->mismatch_count++;
		goto out;
	}

	/* attempt to recover from log if we are starting a dirty array */
	if (!mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr);
out:
	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}
946
947static int ppl_load(struct ppl_conf *ppl_conf)
948{
949 int ret = 0;
950 u32 signature = 0;
951 bool signature_set = false;
952 int i;
953
954 for (i = 0; i < ppl_conf->count; i++) {
955 struct ppl_log *log = &ppl_conf->child_logs[i];
956
957 /* skip missing drive */
958 if (!log->rdev)
959 continue;
960
961 ret = ppl_load_distributed(log);
962 if (ret)
963 break;
964
965 /*
966 * For external metadata we can't check if the signature is
967 * correct on a single drive, but we can check if it is the same
968 * on all drives.
969 */
970 if (ppl_conf->mddev->external) {
971 if (!signature_set) {
972 signature = ppl_conf->signature;
973 signature_set = true;
974 } else if (signature != ppl_conf->signature) {
975 pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
976 mdname(ppl_conf->mddev));
977 ret = -EINVAL;
978 break;
979 }
980 }
981 }
982
983 pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
984 __func__, ret, ppl_conf->mismatch_count,
985 ppl_conf->recovered_entries);
986 return ret;
987}
988
517static void __ppl_exit_log(struct ppl_conf *ppl_conf) 989static void __ppl_exit_log(struct ppl_conf *ppl_conf)
518{ 990{
519 clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); 991 clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
@@ -694,6 +1166,23 @@ int ppl_init_log(struct r5conf *conf)
694 pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n", 1166 pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
695 mdname(mddev)); 1167 mdname(mddev));
696 1168
1169 /* load and possibly recover the logs from the member disks */
1170 ret = ppl_load(ppl_conf);
1171
1172 if (ret) {
1173 goto err;
1174 } else if (!mddev->pers &&
1175 mddev->recovery_cp == 0 && !mddev->degraded &&
1176 ppl_conf->recovered_entries > 0 &&
1177 ppl_conf->mismatch_count == 0) {
1178 /*
1179 * If we are starting a dirty array and the recovery succeeds
1180 * without any issues, set the array as clean.
1181 */
1182 mddev->recovery_cp = MaxSector;
1183 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
1184 }
1185
697 conf->log_private = ppl_conf; 1186 conf->log_private = ppl_conf;
698 1187
699 return 0; 1188 return 0;