diff options
| author | Boaz Harrosh <bharrosh@panasas.com> | 2014-05-22 07:48:15 -0400 |
|---|---|---|
| committer | Boaz Harrosh <bharrosh@panasas.com> | 2014-05-22 07:48:15 -0400 |
| commit | ce5d36aac26cc395fe3bc45525cdbad3644f01e5 (patch) | |
| tree | 04aa541850b8282baf000657307c6504d16c52ab /fs/exofs | |
| parent | 455682ce547817d75e38028283dc8db00754005d (diff) | |
ore: Support for raid 6
This simple patch adds support for raid6 to the ORE.
Most operations and calculations were already written for the general
case. The only things left:
* call async_gen_syndrome() in the case of raid6
(NOTE that the raid6 math is the one supported by the Linux Kernel
see: crypto/async_tx/async_pq.c)
* call _ore_add_parity_unit() twice, with only the last call generating
the redundancy pages.
* Fix a couple of bugs in the old code
a. In reads when parity==2 it can happen that per_dev->length=0
but per_dev->offset was set and adjusted by _ore_add_sg_seg().
Don't let it be overwritten.
b. The whole 'cur_comp > starting_dev' check used to determine whether
"per_dev->offset is in the current stripe number or the
next one"
was a complete raid5/4 accident. When parity==2 this is usually not
true at all. All we need to do is increment si->obj_offset
once we pass by the first parity device.
(This also greatly simplifies the code, amen)
c. Calculation of si->dev rotation can overflow when parity==2.
* Finally, enable raid6 in ore_verify_layout()
I want to deeply thank Daniel Gryniewicz, who first found all the
bugs in the old raid code and inspired these patches:
Inspired-by Daniel Gryniewicz <dang@linuxbox.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs')
| -rw-r--r-- | fs/exofs/Kconfig.ore | 2 | ||||
| -rw-r--r-- | fs/exofs/ore.c | 75 | ||||
| -rw-r--r-- | fs/exofs/ore_raid.c | 37 | ||||
| -rw-r--r-- | fs/exofs/ore_raid.h | 3 |
4 files changed, 80 insertions, 37 deletions
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore index 1ca7fb7b6ba8..2daf2329c28d 100644 --- a/fs/exofs/Kconfig.ore +++ b/fs/exofs/Kconfig.ore | |||
| @@ -9,4 +9,6 @@ config ORE | |||
| 9 | tristate | 9 | tristate |
| 10 | depends on EXOFS_FS || PNFS_OBJLAYOUT | 10 | depends on EXOFS_FS || PNFS_OBJLAYOUT |
| 11 | select ASYNC_XOR | 11 | select ASYNC_XOR |
| 12 | select RAID6_PQ | ||
| 13 | select ASYNC_PQ | ||
| 12 | default SCSI_OSD_ULD | 14 | default SCSI_OSD_ULD |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 0e2a8353f7cc..cfc0205d62c4 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
| @@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) | |||
| 58 | layout->parity = 1; | 58 | layout->parity = 1; |
| 59 | break; | 59 | break; |
| 60 | case PNFS_OSD_RAID_PQ: | 60 | case PNFS_OSD_RAID_PQ: |
| 61 | layout->parity = 2; | ||
| 62 | break; | ||
| 61 | case PNFS_OSD_RAID_4: | 63 | case PNFS_OSD_RAID_4: |
| 62 | default: | 64 | default: |
| 63 | ORE_ERR("Only RAID_0/5 for now\n"); | 65 | ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", |
| 66 | layout->raid_algorithm); | ||
| 64 | return -EINVAL; | 67 | return -EINVAL; |
| 65 | } | 68 | } |
| 66 | if (0 != (layout->stripe_unit & ~PAGE_MASK)) { | 69 | if (0 != (layout->stripe_unit & ~PAGE_MASK)) { |
| @@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) | |||
| 112 | layout->max_io_length /= stripe_length; | 115 | layout->max_io_length /= stripe_length; |
| 113 | layout->max_io_length *= stripe_length; | 116 | layout->max_io_length *= stripe_length; |
| 114 | } | 117 | } |
| 118 | ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); | ||
| 119 | |||
| 115 | return 0; | 120 | return 0; |
| 116 | } | 121 | } |
| 117 | EXPORT_SYMBOL(ore_verify_layout); | 122 | EXPORT_SYMBOL(ore_verify_layout); |
| @@ -561,7 +566,8 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
| 561 | 566 | ||
| 562 | si->par_dev = (group_width + group_width - parity - RxP) % | 567 | si->par_dev = (group_width + group_width - parity - RxP) % |
| 563 | group_width + first_dev; | 568 | group_width + first_dev; |
| 564 | si->dev = (group_width + C - RxP) % group_width + first_dev; | 569 | si->dev = (group_width + group_width + C - RxP) % |
| 570 | group_width + first_dev; | ||
| 565 | si->bytes_in_stripe = U; | 571 | si->bytes_in_stripe = U; |
| 566 | si->first_stripe_start = M * S + G * T + N * U; | 572 | si->first_stripe_start = M * S + G * T + N * U; |
| 567 | } else { | 573 | } else { |
| @@ -651,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance | |||
| 651 | return ret; | 657 | return ret; |
| 652 | } | 658 | } |
| 653 | 659 | ||
| 660 | static int _add_parity_units(struct ore_io_state *ios, | ||
| 661 | struct ore_striping_info *si, | ||
| 662 | unsigned dev, unsigned first_dev, | ||
| 663 | unsigned mirrors_p1, unsigned devs_in_group, | ||
| 664 | unsigned cur_len) | ||
| 665 | { | ||
| 666 | unsigned do_parity; | ||
| 667 | int ret = 0; | ||
| 668 | |||
| 669 | for (do_parity = ios->layout->parity; do_parity; --do_parity) { | ||
| 670 | struct ore_per_dev_state *per_dev; | ||
| 671 | |||
| 672 | per_dev = &ios->per_dev[dev - first_dev]; | ||
| 673 | if (!per_dev->length && !per_dev->offset) { | ||
| 674 | /* Only/always the parity unit of the first | ||
| 675 | * stripe will be empty. So this is a chance to | ||
| 676 | * initialize the per_dev info. | ||
| 677 | */ | ||
| 678 | per_dev->dev = dev; | ||
| 679 | per_dev->offset = si->obj_offset - si->unit_off; | ||
| 680 | } | ||
| 681 | |||
| 682 | ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, | ||
| 683 | do_parity == 1); | ||
| 684 | if (unlikely(ret)) | ||
| 685 | break; | ||
| 686 | |||
| 687 | if (do_parity != 1) { | ||
| 688 | dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; | ||
| 689 | si->cur_comp = (si->cur_comp + 1) % | ||
| 690 | ios->layout->group_width; | ||
| 691 | } | ||
| 692 | } | ||
| 693 | |||
| 694 | return ret; | ||
| 695 | } | ||
| 696 | |||
| 654 | static int _prepare_for_striping(struct ore_io_state *ios) | 697 | static int _prepare_for_striping(struct ore_io_state *ios) |
| 655 | { | 698 | { |
| 656 | struct ore_striping_info *si = &ios->si; | 699 | struct ore_striping_info *si = &ios->si; |
| @@ -660,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
| 660 | unsigned devs_in_group = group_width * mirrors_p1; | 703 | unsigned devs_in_group = group_width * mirrors_p1; |
| 661 | unsigned dev = si->dev; | 704 | unsigned dev = si->dev; |
| 662 | unsigned first_dev = dev - (dev % devs_in_group); | 705 | unsigned first_dev = dev - (dev % devs_in_group); |
| 663 | unsigned dev_order; | ||
| 664 | unsigned cur_pg = ios->pages_consumed; | 706 | unsigned cur_pg = ios->pages_consumed; |
| 665 | u64 length = ios->length; | 707 | u64 length = ios->length; |
| 666 | int ret = 0; | 708 | int ret = 0; |
| @@ -672,14 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
| 672 | 714 | ||
| 673 | BUG_ON(length > si->length); | 715 | BUG_ON(length > si->length); |
| 674 | 716 | ||
| 675 | dev_order = si->cur_comp; | ||
| 676 | |||
| 677 | while (length) { | 717 | while (length) { |
| 678 | struct ore_per_dev_state *per_dev = | 718 | struct ore_per_dev_state *per_dev = |
| 679 | &ios->per_dev[dev - first_dev]; | 719 | &ios->per_dev[dev - first_dev]; |
| 680 | unsigned cur_len, page_off = 0; | 720 | unsigned cur_len, page_off = 0; |
| 681 | 721 | ||
| 682 | if (!per_dev->length) { | 722 | if (!per_dev->length && !per_dev->offset) { |
| 723 | /* First time initialize the per_dev info. */ | ||
| 683 | per_dev->dev = dev; | 724 | per_dev->dev = dev; |
| 684 | if (dev == si->dev) { | 725 | if (dev == si->dev) { |
| 685 | WARN_ON(dev == si->par_dev); | 726 | WARN_ON(dev == si->par_dev); |
| @@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
| 688 | page_off = si->unit_off & ~PAGE_MASK; | 729 | page_off = si->unit_off & ~PAGE_MASK; |
| 689 | BUG_ON(page_off && (page_off != ios->pgbase)); | 730 | BUG_ON(page_off && (page_off != ios->pgbase)); |
| 690 | } else { | 731 | } else { |
| 691 | if (si->cur_comp > dev_order) | 732 | per_dev->offset = si->obj_offset - si->unit_off; |
| 692 | per_dev->offset = | ||
| 693 | si->obj_offset - si->unit_off; | ||
| 694 | else /* si->cur_comp < dev_order */ | ||
| 695 | per_dev->offset = | ||
| 696 | si->obj_offset + stripe_unit - | ||
| 697 | si->unit_off; | ||
| 698 | cur_len = stripe_unit; | 733 | cur_len = stripe_unit; |
| 699 | } | 734 | } |
| 700 | } else { | 735 | } else { |
| @@ -721,20 +756,12 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
| 721 | /* If last stripe operate on parity comp */ | 756 | /* If last stripe operate on parity comp */ |
| 722 | si->cur_comp = group_width - ios->layout->parity; | 757 | si->cur_comp = group_width - ios->layout->parity; |
| 723 | } | 758 | } |
| 724 | per_dev = &ios->per_dev[dev - first_dev]; | ||
| 725 | if (!per_dev->length) { | ||
| 726 | /* Only/always the parity unit of the first | ||
| 727 | * stripe will be empty. So this is a chance to | ||
| 728 | * initialize the per_dev info. | ||
| 729 | */ | ||
| 730 | per_dev->dev = dev; | ||
| 731 | per_dev->offset = si->obj_offset - si->unit_off; | ||
| 732 | } | ||
| 733 | 759 | ||
| 734 | /* In writes cur_len just means if it's the | 760 | /* In writes cur_len just means if it's the |
| 735 | * last one. See _ore_add_parity_unit. | 761 | * last one. See _ore_add_parity_unit. |
| 736 | */ | 762 | */ |
| 737 | ret = _ore_add_parity_unit(ios, si, per_dev, | 763 | ret = _add_parity_units(ios, si, dev, first_dev, |
| 764 | mirrors_p1, devs_in_group, | ||
| 738 | ios->sp2d ? length : cur_len); | 765 | ios->sp2d ? length : cur_len); |
| 739 | if (unlikely(ret)) | 766 | if (unlikely(ret)) |
| 740 | goto out; | 767 | goto out; |
| @@ -746,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) | |||
| 746 | /* Next stripe, start fresh */ | 773 | /* Next stripe, start fresh */ |
| 747 | si->cur_comp = 0; | 774 | si->cur_comp = 0; |
| 748 | si->cur_pg = 0; | 775 | si->cur_pg = 0; |
| 776 | si->obj_offset += cur_len; | ||
| 777 | si->unit_off = 0; | ||
| 749 | } | 778 | } |
| 750 | } | 779 | } |
| 751 | out: | 780 | out: |
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index d58a952e28bc..7f20f25c232c 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c | |||
| @@ -218,20 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | |||
| 218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | 218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) |
| 219 | { | 219 | { |
| 220 | unsigned p; | 220 | unsigned p; |
| 221 | unsigned tx_flags = ASYNC_TX_ACK; | ||
| 222 | |||
| 223 | if (sp2d->parity == 1) | ||
| 224 | tx_flags |= ASYNC_TX_XOR_ZERO_DST; | ||
| 225 | |||
| 221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | 226 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
| 222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | 227 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; |
| 223 | 228 | ||
| 224 | if (!_1ps->write_count) | 229 | if (!_1ps->write_count) |
| 225 | continue; | 230 | continue; |
| 226 | 231 | ||
| 227 | init_async_submit(&_1ps->submit, | 232 | init_async_submit(&_1ps->submit, tx_flags, |
| 228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | ||
| 229 | NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); | 233 | NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); |
| 230 | 234 | ||
| 231 | /* TODO: raid6 */ | 235 | if (sp2d->parity == 1) |
| 232 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | 236 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], |
| 233 | 0, sp2d->data_devs, PAGE_SIZE, | 237 | _1ps->pages, 0, sp2d->data_devs, |
| 234 | &_1ps->submit); | 238 | PAGE_SIZE, &_1ps->submit); |
| 239 | else /* parity == 2 */ | ||
| 240 | _1ps->tx = async_gen_syndrome(_1ps->pages, 0, | ||
| 241 | sp2d->data_devs + sp2d->parity, | ||
| 242 | PAGE_SIZE, &_1ps->submit); | ||
| 235 | } | 243 | } |
| 236 | 244 | ||
| 237 | for (p = 0; p < sp2d->pages_in_unit; p++) { | 245 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
| @@ -616,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios) | |||
| 616 | int _ore_add_parity_unit(struct ore_io_state *ios, | 624 | int _ore_add_parity_unit(struct ore_io_state *ios, |
| 617 | struct ore_striping_info *si, | 625 | struct ore_striping_info *si, |
| 618 | struct ore_per_dev_state *per_dev, | 626 | struct ore_per_dev_state *per_dev, |
| 619 | unsigned cur_len) | 627 | unsigned cur_len, bool do_xor) |
| 620 | { | 628 | { |
| 621 | if (ios->reading) { | 629 | if (ios->reading) { |
| 622 | if (per_dev->cur_sg >= ios->sgs_per_dev) { | 630 | if (per_dev->cur_sg >= ios->sgs_per_dev) { |
| @@ -641,9 +649,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | |||
| 641 | /* If first stripe, Read in all read4write pages | 649 | /* If first stripe, Read in all read4write pages |
| 642 | * (if needed) before we calculate the first parity. | 650 | * (if needed) before we calculate the first parity. |
| 643 | */ | 651 | */ |
| 644 | _read_4_write_first_stripe(ios); | 652 | if (do_xor) |
| 653 | _read_4_write_first_stripe(ios); | ||
| 645 | } | 654 | } |
| 646 | if (!cur_len) /* If last stripe r4w pages of last stripe */ | 655 | if (!cur_len && do_xor) |
| 656 | /* If last stripe r4w pages of last stripe */ | ||
| 647 | _read_4_write_last_stripe(ios); | 657 | _read_4_write_last_stripe(ios); |
| 648 | _read_4_write_execute(ios); | 658 | _read_4_write_execute(ios); |
| 649 | 659 | ||
| @@ -655,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | |||
| 655 | ++(ios->cur_par_page); | 665 | ++(ios->cur_par_page); |
| 656 | } | 666 | } |
| 657 | 667 | ||
| 658 | BUG_ON(si->cur_comp != sp2d->data_devs); | 668 | BUG_ON(si->cur_comp < sp2d->data_devs); |
| 659 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | 669 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); |
| 660 | 670 | ||
| 661 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | 671 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, |
| @@ -663,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios, | |||
| 663 | if (unlikely(ret)) | 673 | if (unlikely(ret)) |
| 664 | return ret; | 674 | return ret; |
| 665 | 675 | ||
| 666 | /* TODO: raid6 if (last_parity_dev) */ | 676 | if (do_xor) { |
| 667 | _gen_xor_unit(sp2d); | 677 | _gen_xor_unit(sp2d); |
| 668 | _sp2d_reset(sp2d, ios->r4w, ios->private); | 678 | _sp2d_reset(sp2d, ios->r4w, ios->private); |
| 679 | } | ||
| 669 | } | 680 | } |
| 670 | return 0; | 681 | return 0; |
| 671 | } | 682 | } |
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index d365bda6beef..cf6375d82129 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h | |||
| @@ -38,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios); | |||
| 38 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | 38 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, |
| 39 | bool not_last); | 39 | bool not_last); |
| 40 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, | 40 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, |
| 41 | struct ore_per_dev_state *per_dev, unsigned cur_len); | 41 | struct ore_per_dev_state *per_dev, unsigned cur_len, |
| 42 | bool do_xor); | ||
| 42 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | 43 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, |
| 43 | struct ore_striping_info *si, struct page *page); | 44 | struct ore_striping_info *si, struct page *page); |
| 44 | static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, | 45 | static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, |
