diff options
author | Boaz Harrosh <bharrosh@panasas.com> | 2014-05-22 07:48:15 -0400 |
---|---|---|
committer | Boaz Harrosh <bharrosh@panasas.com> | 2014-05-22 07:48:15 -0400 |
commit | ce5d36aac26cc395fe3bc45525cdbad3644f01e5 (patch) | |
tree | 04aa541850b8282baf000657307c6504d16c52ab /fs/exofs | |
parent | 455682ce547817d75e38028283dc8db00754005d (diff) |
ore: Support for raid 6
This simple patch adds support for raid6 to the ORE.
Most operations and calculations were already written for the general
case. Only things left:
* call async_gen_syndrome() in the case of raid6
(NOTE that the raid6 math is the one supported by the Linux Kernel
see: crypto/async_tx/async_pq.c)
* call _ore_add_parity_unit() twice with only last call generating
the redundancy pages.
* Fix a couple of BUGS in old code
a. In reads when parity==2 it can happen that per_dev->length=0
but per_dev->offset was set and adjusted by _ore_add_sg_seg().
Don't let it be overwritten.
b. The whole 'cur_comp > starting_dev' thing to determine if:
"per_dev->offset is in the current stripe number or the
next one."
Was a complete raid5/4 accident. When parity==2 this is not
at all true usually. All we need to do is increment si->obj_offset
once we pass by the first parity device.
(This also greatly simplifies the code, amen)
c. Calculation of si->dev rotation can overflow when parity==2.
* Then, lastly, enable raid6 in ore_verify_layout()
I want to deeply thank Daniel Gryniewicz who first found all the
bugs in the old raid code, and inspired these patches:
Inspired-by Daniel Gryniewicz <dang@linuxbox.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs')
-rw-r--r-- | fs/exofs/Kconfig.ore | 2 | ||||
-rw-r--r-- | fs/exofs/ore.c | 75 | ||||
-rw-r--r-- | fs/exofs/ore_raid.c | 37 | ||||
-rw-r--r-- | fs/exofs/ore_raid.h | 3 |
4 files changed, 80 insertions, 37 deletions
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore index 1ca7fb7b6ba8..2daf2329c28d 100644 --- a/fs/exofs/Kconfig.ore +++ b/fs/exofs/Kconfig.ore | |||
@@ -9,4 +9,6 @@ config ORE | |||
9 | tristate | 9 | tristate |
10 | depends on EXOFS_FS || PNFS_OBJLAYOUT | 10 | depends on EXOFS_FS || PNFS_OBJLAYOUT |
11 | select ASYNC_XOR | 11 | select ASYNC_XOR |
12 | select RAID6_PQ | ||
13 | select ASYNC_PQ | ||
12 | default SCSI_OSD_ULD | 14 | default SCSI_OSD_ULD |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 0e2a8353f7cc..cfc0205d62c4 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
@@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) | |||
58 | layout->parity = 1; | 58 | layout->parity = 1; |
59 | break; | 59 | break; |
60 | case PNFS_OSD_RAID_PQ: | 60 | case PNFS_OSD_RAID_PQ: |
61 | layout->parity = 2; | ||
62 | break; | ||
61 | case PNFS_OSD_RAID_4: | 63 | case PNFS_OSD_RAID_4: |
62 | default: | 64 | default: |
63 | ORE_ERR("Only RAID_0/5 for now\n"); | 65 | ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", |
66 | layout->raid_algorithm); | ||
64 | return -EINVAL; | 67 | return -EINVAL; |
65 | } | 68 | } |
66 | if (0 != (layout->stripe_unit & ~PAGE_MASK)) { | 69 | if (0 != (layout->stripe_unit & ~PAGE_MASK)) { |
@@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) | |||
112 | layout->max_io_length /= stripe_length; | 115 | layout->max_io_length /= stripe_length; |
113 | layout->max_io_length *= stripe_length; | 116 | layout->max_io_length *= stripe_length; |
114 | } | 117 | } |
118 | ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); | ||
119 | |||
115 | return 0; | 120 | return 0; |
116 | } | 121 | } |
117 | EXPORT_SYMBOL(ore_verify_layout); | 122 | EXPORT_SYMBOL(ore_verify_layout); |
@@ -561,7 +566,8 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
561 | 566 | ||
562 | si->par_dev = (group_width + group_width - parity - RxP) % | 567 | si->par_dev = (group_width + group_width - parity - RxP) % |
563 | group_width + first_dev; | 568 | group_width + first_dev; |
564 | si->dev = (group_width + C - RxP) % group_width + first_dev; | 569 | si->dev = (group_width + group_width + C - RxP) % |
570 | group_width + first_dev; | ||
565 | si->bytes_in_stripe = U; | 571 | si->bytes_in_stripe = U; |
566 | si->first_stripe_start = M * S + G * T + N * U; | 572 | si->first_stripe_start = M * S + G * T + N * U; |
567 | } else { | 573 | } else { |
@@ -651,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance | |||
651 | return ret; | 657 | return ret; |
652 | } | 658 | } |
653 | 659 | ||
660 | static int _add_parity_units(struct ore_io_state *ios, | ||
661 | struct ore_striping_info *si, | ||
662 | unsigned dev, unsigned first_dev, | ||
663 | unsigned mirrors_p1, unsigned devs_in_group, | ||
664 | unsigned cur_len) | ||
665 | { | ||
666 | unsigned do_parity; | ||
667 | int ret = 0; | ||
668 | |||
669 | for (do_parity = ios->layout->parity; do_parity; --do_parity) { | ||
670 | struct ore_per_dev_state *per_dev; | ||
671 | |||
672 | per_dev = &ios->per_dev[dev - first_dev]; | ||
673 | if (!per_dev->length && !per_dev->offset) { | ||
674 | /* Only/always the parity unit of the first | ||
675 | * stripe will be empty. So this is a chance to | ||
676 | * initialize the per_dev info. | ||
677 | */ | ||
678 | per_dev->dev = dev; | ||
679 | per_dev->offset = si->obj_offset - si->unit_off; | ||
680 | } | ||
681 | |||
682 | ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, | ||
683 | do_parity == 1); | ||
684 | if (unlikely(ret)) | ||
685 | break; | ||
686 | |||
687 | if (do_parity != 1) { | ||
688 | dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; | ||
689 | si->cur_comp = (si->cur_comp + 1) % | ||
690 | ios->layout->group_width; | ||
691 | } | ||
692 | } | ||
693 | |||
694 | return ret; | ||
695 | } | ||
696 | |||
654 | static int _prepare_for_striping(struct ore_io_state *ios) | 697 | static int _prepare_for_striping(struct ore_io_state *ios) |
655 | { | 698 | { |
656 | struct ore_striping_info *si = &ios->si; | 699 | struct ore_striping_info *si = &ios->si; |
@@ -660,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
660 | unsigned devs_in_group = group_width * mirrors_p1; | 703 | unsigned devs_in_group = group_width * mirrors_p1; |
661 | unsigned dev = si->dev; | 704 | unsigned dev = si->dev; |
662 | unsigned first_dev = dev - (dev % devs_in_group); | 705 | unsigned first_dev = dev - (dev % devs_in_group); |
663 | unsigned dev_order; | ||
664 | unsigned cur_pg = ios->pages_consumed; | 706 | unsigned cur_pg = ios->pages_consumed; |
665 | u64 length = ios->length; | 707 | u64 length = ios->length; |
666 | int ret = 0; | 708 | int ret = 0; |
@@ -672,14 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
672 | 714 | ||
673 | BUG_ON(length > si->length); | 715 | BUG_ON(length > si->length); |
674 | 716 | ||
675 | dev_order = si->cur_comp; | ||
676 | |||
677 | while (length) { | 717 | while (length) { |
678 | struct ore_per_dev_state *per_dev = | 718 | struct ore_per_dev_state *per_dev = |
679 | &ios->per_dev[dev - first_dev]; | 719 | &ios->per_dev[dev - first_dev]; |
680 | unsigned cur_len, page_off = 0; | 720 | unsigned cur_len, page_off = 0; |
681 | 721 | ||
682 | if (!per_dev->length) { | 722 | if (!per_dev->length && !per_dev->offset) { |
723 | /* First time initialize the per_dev info. */ | ||
683 | per_dev->dev = dev; | 724 | per_dev->dev = dev; |
684 | if (dev == si->dev) { | 725 | if (dev == si->dev) { |
685 | WARN_ON(dev == si->par_dev); | 726 | WARN_ON(dev == si->par_dev); |
@@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
688 | page_off = si->unit_off & ~PAGE_MASK; | 729 | page_off = si->unit_off & ~PAGE_MASK; |
689 | BUG_ON(page_off && (page_off != ios->pgbase)); | 730 | BUG_ON(page_off && (page_off != ios->pgbase)); |
690 | } else { | 731 | } else { |
691 | if (si->cur_comp > dev_order) | 732 | per_dev->offset = si->obj_offset - si->unit_off; |
692 | per_dev->offset = | ||
693 | si->obj_offset - si->unit_off; | ||
694 | else /* si->cur_comp < dev_order */ | ||
695 | per_dev->offset = | ||
696 | si->obj_offset + stripe_unit - | ||
697 | si->unit_off; | ||
698 | cur_len = stripe_unit; | 733 | cur_len = stripe_unit; |
699 | } | 734 | } |
700 | } else { | 735 | } else { |
@@ -721,20 +756,12 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
721 | /* If last stripe operate on parity comp */ | 756 | /* If last stripe operate on parity comp */ |
722 | si->cur_comp = group_width - ios->layout->parity; | 757 | si->cur_comp = group_width - ios->layout->parity; |
723 | } | 758 | } |
724 | per_dev = &ios->per_dev[dev - first_dev]; | ||
725 | if (!per_dev->length) { | ||
726 | /* Only/always the parity unit of the first | ||
727 | * stripe will be empty. So this is a chance to | ||
728 | * initialize the per_dev info. | ||
729 | */ | ||
730 | per_dev->dev = dev; | ||
731 | per_dev->offset = si->obj_offset - si->unit_off; | ||
732 | } | ||
733 | 759 | ||
734 | /* In writes cur_len just means if it's the | 760 | /* In writes cur_len just means if it's the |
735 | * last one. See _ore_add_parity_unit. | 761 | * last one. See _ore_add_parity_unit. |
736 | */ | 762 | */ |
737 | ret = _ore_add_parity_unit(ios, si, per_dev, | 763 | ret = _add_parity_units(ios, si, dev, first_dev, |
764 | mirrors_p1, devs_in_group, | ||
738 | ios->sp2d ? length : cur_len); | 765 | ios->sp2d ? length : cur_len); |
739 | if (unlikely(ret)) | 766 | if (unlikely(ret)) |
740 | goto out; | 767 | goto out; |
@@ -746,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
746 | /* Next stripe, start fresh */ | 773 | /* Next stripe, start fresh */ |
747 | si->cur_comp = 0; | 774 | si->cur_comp = 0; |
748 | si->cur_pg = 0; | 775 | si->cur_pg = 0; |
776 | si->obj_offset += cur_len; | ||
777 | si->unit_off = 0; | ||
749 | } | 778 | } |
750 | } | 779 | } |
751 | out: | 780 | out: |
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index d58a952e28bc..7f20f25c232c 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c | |||
@@ -218,20 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | |||
218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | 218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) |
219 | { | 219 | { |
220 | unsigned p; | 220 | unsigned p; |
221 | unsigned tx_flags = ASYNC_TX_ACK; | ||
222 | |||
223 | if (sp2d->parity == 1) | ||
224 | tx_flags |= ASYNC_TX_XOR_ZERO_DST; | ||
225 | |||
221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | 226 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | 227 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; |
223 | 228 | ||
224 | if (!_1ps->write_count) | 229 | if (!_1ps->write_count) |
225 | continue; | 230 | continue; |
226 | 231 | ||
227 | init_async_submit(&_1ps->submit, | 232 | init_async_submit(&_1ps->submit, tx_flags, |
228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | ||
229 | NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); | 233 | NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); |
230 | 234 | ||
231 | /* TODO: raid6 */ | 235 | if (sp2d->parity == 1) |
232 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | 236 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], |
233 | 0, sp2d->data_devs, PAGE_SIZE, | 237 | _1ps->pages, 0, sp2d->data_devs, |
234 | &_1ps->submit); | 238 | PAGE_SIZE, &_1ps->submit); |
239 | else /* parity == 2 */ | ||
240 | _1ps->tx = async_gen_syndrome(_1ps->pages, 0, | ||
241 | sp2d->data_devs + sp2d->parity, | ||
242 | PAGE_SIZE, &_1ps->submit); | ||
235 | } | 243 | } |
236 | 244 | ||
237 | for (p = 0; p < sp2d->pages_in_unit; p++) { | 245 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
@@ -616,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios) | |||
616 | int _ore_add_parity_unit(struct ore_io_state *ios, | 624 | int _ore_add_parity_unit(struct ore_io_state *ios, |
617 | struct ore_striping_info *si, | 625 | struct ore_striping_info *si, |
618 | struct ore_per_dev_state *per_dev, | 626 | struct ore_per_dev_state *per_dev, |
619 | unsigned cur_len) | 627 | unsigned cur_len, bool do_xor) |
620 | { | 628 | { |
621 | if (ios->reading) { | 629 | if (ios->reading) { |
622 | if (per_dev->cur_sg >= ios->sgs_per_dev) { | 630 | if (per_dev->cur_sg >= ios->sgs_per_dev) { |
@@ -641,9 +649,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | |||
641 | /* If first stripe, Read in all read4write pages | 649 | /* If first stripe, Read in all read4write pages |
642 | * (if needed) before we calculate the first parity. | 650 | * (if needed) before we calculate the first parity. |
643 | */ | 651 | */ |
644 | _read_4_write_first_stripe(ios); | 652 | if (do_xor) |
653 | _read_4_write_first_stripe(ios); | ||
645 | } | 654 | } |
646 | if (!cur_len) /* If last stripe r4w pages of last stripe */ | 655 | if (!cur_len && do_xor) |
656 | /* If last stripe r4w pages of last stripe */ | ||
647 | _read_4_write_last_stripe(ios); | 657 | _read_4_write_last_stripe(ios); |
648 | _read_4_write_execute(ios); | 658 | _read_4_write_execute(ios); |
649 | 659 | ||
@@ -655,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | |||
655 | ++(ios->cur_par_page); | 665 | ++(ios->cur_par_page); |
656 | } | 666 | } |
657 | 667 | ||
658 | BUG_ON(si->cur_comp != sp2d->data_devs); | 668 | BUG_ON(si->cur_comp < sp2d->data_devs); |
659 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | 669 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); |
660 | 670 | ||
661 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | 671 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, |
@@ -663,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | |||
663 | if (unlikely(ret)) | 673 | if (unlikely(ret)) |
664 | return ret; | 674 | return ret; |
665 | 675 | ||
666 | /* TODO: raid6 if (last_parity_dev) */ | 676 | if (do_xor) { |
667 | _gen_xor_unit(sp2d); | 677 | _gen_xor_unit(sp2d); |
668 | _sp2d_reset(sp2d, ios->r4w, ios->private); | 678 | _sp2d_reset(sp2d, ios->r4w, ios->private); |
679 | } | ||
669 | } | 680 | } |
670 | return 0; | 681 | return 0; |
671 | } | 682 | } |
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index d365bda6beef..cf6375d82129 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h | |||
@@ -38,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios); | |||
38 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | 38 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, |
39 | bool not_last); | 39 | bool not_last); |
40 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, | 40 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, |
41 | struct ore_per_dev_state *per_dev, unsigned cur_len); | 41 | struct ore_per_dev_state *per_dev, unsigned cur_len, |
42 | bool do_xor); | ||
42 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | 43 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, |
43 | struct ore_striping_info *si, struct page *page); | 44 | struct ore_striping_info *si, struct page *page); |
44 | static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, | 45 | static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, |