aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2014-06-07 20:07:20 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-06-07 20:07:20 -0400
commit 9d2cd01b15d0782adb81e40094b67904d77b03df (patch)
tree f8091fcd05f463a0b31485cfe3edcef0d0211da9
parent 57d326169e878a1a37b2bccd1cf81f6809ee67b9 (diff)
parent ce5d36aac26cc395fe3bc45525cdbad3644f01e5 (diff)
Merge branch 'for-linus' of git://git.open-osd.org/linux-open-osd into next
Pull exofs raid6 support from Boaz Harrosh:

"These simple patches enable raid6, using the kernel's raid6_pq engine, under exofs and pnfs-objects. Nothing needs to be done in exofs or pnfs-obj: just fire up mkfs.exofs with --raid=6 (which was already supported before) and off you go as usual. The ORE will pick up the new map and will start writing two devices of redundancy bits. The patches are so simple because most of the ORE was already written for the general raid case; only a few bug fixes were needed, plus the actual wiring into the raid6_pq engine"

* 'for-linus' of git://git.open-osd.org/linux-open-osd:
  ore: Support for raid 6
  ore: Remove redundant dev_order(), more cleanups
  ore: (trivial) reformat some code
-rw-r--r-- fs/exofs/Kconfig.ore 2
-rw-r--r-- fs/exofs/ore.c 100
-rw-r--r-- fs/exofs/ore_raid.c 56
-rw-r--r-- fs/exofs/ore_raid.h 21
4 files changed, 98 insertions, 81 deletions
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
index 1ca7fb7b6ba8..2daf2329c28d 100644
--- a/fs/exofs/Kconfig.ore
+++ b/fs/exofs/Kconfig.ore
@@ -9,4 +9,6 @@ config ORE
9 tristate 9 tristate
10 depends on EXOFS_FS || PNFS_OBJLAYOUT 10 depends on EXOFS_FS || PNFS_OBJLAYOUT
11 select ASYNC_XOR 11 select ASYNC_XOR
12 select RAID6_PQ
13 select ASYNC_PQ
12 default SCSI_OSD_ULD 14 default SCSI_OSD_ULD
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index dae884694bd9..cfc0205d62c4 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
58 layout->parity = 1; 58 layout->parity = 1;
59 break; 59 break;
60 case PNFS_OSD_RAID_PQ: 60 case PNFS_OSD_RAID_PQ:
61 layout->parity = 2;
62 break;
61 case PNFS_OSD_RAID_4: 63 case PNFS_OSD_RAID_4:
62 default: 64 default:
63 ORE_ERR("Only RAID_0/5 for now\n"); 65 ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
66 layout->raid_algorithm);
64 return -EINVAL; 67 return -EINVAL;
65 } 68 }
66 if (0 != (layout->stripe_unit & ~PAGE_MASK)) { 69 if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
@@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
112 layout->max_io_length /= stripe_length; 115 layout->max_io_length /= stripe_length;
113 layout->max_io_length *= stripe_length; 116 layout->max_io_length *= stripe_length;
114 } 117 }
118 ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
119
115 return 0; 120 return 0;
116} 121}
117EXPORT_SYMBOL(ore_verify_layout); 122EXPORT_SYMBOL(ore_verify_layout);
@@ -545,21 +550,24 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
545 550
546 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 551 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
547 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; 552 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
553 u32 first_dev = C - C % group_width;
548 554
549 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 555 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
550 556
551 si->obj_offset = si->unit_off + (N * stripe_unit) + 557 si->obj_offset = si->unit_off + (N * stripe_unit) +
552 (M * group_depth * stripe_unit); 558 (M * group_depth * stripe_unit);
559 si->cur_comp = C - first_dev;
560 si->cur_pg = si->unit_off / PAGE_SIZE;
553 561
554 if (parity) { 562 if (parity) {
555 u32 LCMdP = lcm(group_width, parity) / parity; 563 u32 LCMdP = lcm(group_width, parity) / parity;
556 /* R = N % LCMdP; */ 564 /* R = N % LCMdP; */
557 u32 RxP = (N % LCMdP) * parity; 565 u32 RxP = (N % LCMdP) * parity;
558 u32 first_dev = C - C % group_width;
559 566
560 si->par_dev = (group_width + group_width - parity - RxP) % 567 si->par_dev = (group_width + group_width - parity - RxP) %
561 group_width + first_dev; 568 group_width + first_dev;
562 si->dev = (group_width + C - RxP) % group_width + first_dev; 569 si->dev = (group_width + group_width + C - RxP) %
570 group_width + first_dev;
563 si->bytes_in_stripe = U; 571 si->bytes_in_stripe = U;
564 si->first_stripe_start = M * S + G * T + N * U; 572 si->first_stripe_start = M * S + G * T + N * U;
565 } else { 573 } else {
@@ -649,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance
649 return ret; 657 return ret;
650} 658}
651 659
660static int _add_parity_units(struct ore_io_state *ios,
661 struct ore_striping_info *si,
662 unsigned dev, unsigned first_dev,
663 unsigned mirrors_p1, unsigned devs_in_group,
664 unsigned cur_len)
665{
666 unsigned do_parity;
667 int ret = 0;
668
669 for (do_parity = ios->layout->parity; do_parity; --do_parity) {
670 struct ore_per_dev_state *per_dev;
671
672 per_dev = &ios->per_dev[dev - first_dev];
673 if (!per_dev->length && !per_dev->offset) {
674 /* Only/always the parity unit of the first
675 * stripe will be empty. So this is a chance to
676 * initialize the per_dev info.
677 */
678 per_dev->dev = dev;
679 per_dev->offset = si->obj_offset - si->unit_off;
680 }
681
682 ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
683 do_parity == 1);
684 if (unlikely(ret))
685 break;
686
687 if (do_parity != 1) {
688 dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
689 si->cur_comp = (si->cur_comp + 1) %
690 ios->layout->group_width;
691 }
692 }
693
694 return ret;
695}
696
652static int _prepare_for_striping(struct ore_io_state *ios) 697static int _prepare_for_striping(struct ore_io_state *ios)
653{ 698{
654 struct ore_striping_info *si = &ios->si; 699 struct ore_striping_info *si = &ios->si;
@@ -658,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
658 unsigned devs_in_group = group_width * mirrors_p1; 703 unsigned devs_in_group = group_width * mirrors_p1;
659 unsigned dev = si->dev; 704 unsigned dev = si->dev;
660 unsigned first_dev = dev - (dev % devs_in_group); 705 unsigned first_dev = dev - (dev % devs_in_group);
661 unsigned dev_order;
662 unsigned cur_pg = ios->pages_consumed; 706 unsigned cur_pg = ios->pages_consumed;
663 u64 length = ios->length; 707 u64 length = ios->length;
664 int ret = 0; 708 int ret = 0;
@@ -670,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios)
670 714
671 BUG_ON(length > si->length); 715 BUG_ON(length > si->length);
672 716
673 dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
674 si->cur_comp = dev_order;
675 si->cur_pg = si->unit_off / PAGE_SIZE;
676
677 while (length) { 717 while (length) {
678 unsigned comp = dev - first_dev; 718 struct ore_per_dev_state *per_dev =
679 struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; 719 &ios->per_dev[dev - first_dev];
680 unsigned cur_len, page_off = 0; 720 unsigned cur_len, page_off = 0;
681 721
682 if (!per_dev->length) { 722 if (!per_dev->length && !per_dev->offset) {
723 /* First time initialize the per_dev info. */
683 per_dev->dev = dev; 724 per_dev->dev = dev;
684 if (dev == si->dev) { 725 if (dev == si->dev) {
685 WARN_ON(dev == si->par_dev); 726 WARN_ON(dev == si->par_dev);
@@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
688 page_off = si->unit_off & ~PAGE_MASK; 729 page_off = si->unit_off & ~PAGE_MASK;
689 BUG_ON(page_off && (page_off != ios->pgbase)); 730 BUG_ON(page_off && (page_off != ios->pgbase));
690 } else { 731 } else {
691 if (si->cur_comp > dev_order) 732 per_dev->offset = si->obj_offset - si->unit_off;
692 per_dev->offset =
693 si->obj_offset - si->unit_off;
694 else /* si->cur_comp < dev_order */
695 per_dev->offset =
696 si->obj_offset + stripe_unit -
697 si->unit_off;
698 cur_len = stripe_unit; 733 cur_len = stripe_unit;
699 } 734 }
700 } else { 735 } else {
@@ -708,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios)
708 if (unlikely(ret)) 743 if (unlikely(ret))
709 goto out; 744 goto out;
710 745
711 dev += mirrors_p1;
712 dev = (dev % devs_in_group) + first_dev;
713
714 length -= cur_len; 746 length -= cur_len;
715 747
748 dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
716 si->cur_comp = (si->cur_comp + 1) % group_width; 749 si->cur_comp = (si->cur_comp + 1) % group_width;
717 if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { 750 if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
718 if (!length && ios->sp2d) { 751 if (!length && ios->sp2d) {
@@ -720,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios)
720 * stripe. then operate on parity dev. 753 * stripe. then operate on parity dev.
721 */ 754 */
722 dev = si->par_dev; 755 dev = si->par_dev;
723 } 756 /* If last stripe operate on parity comp */
724 if (ios->sp2d) 757 si->cur_comp = group_width - ios->layout->parity;
725 /* In writes cur_len just means if it's the
726 * last one. See _ore_add_parity_unit.
727 */
728 cur_len = length;
729 per_dev = &ios->per_dev[dev - first_dev];
730 if (!per_dev->length) {
731 /* Only/always the parity unit of the first
732 * stripe will be empty. So this is a chance to
733 * initialize the per_dev info.
734 */
735 per_dev->dev = dev;
736 per_dev->offset = si->obj_offset - si->unit_off;
737 } 758 }
738 759
739 ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); 760 /* In writes cur_len just means if it's the
761 * last one. See _ore_add_parity_unit.
762 */
763 ret = _add_parity_units(ios, si, dev, first_dev,
764 mirrors_p1, devs_in_group,
765 ios->sp2d ? length : cur_len);
740 if (unlikely(ret)) 766 if (unlikely(ret))
741 goto out; 767 goto out;
742 768
@@ -747,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
747 /* Next stripe, start fresh */ 773 /* Next stripe, start fresh */
748 si->cur_comp = 0; 774 si->cur_comp = 0;
749 si->cur_pg = 0; 775 si->cur_pg = 0;
776 si->obj_offset += cur_len;
777 si->unit_off = 0;
750 } 778 }
751 } 779 }
752out: 780out:
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 4e2c032ab8a1..7f20f25c232c 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
218static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) 218static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
219{ 219{
220 unsigned p; 220 unsigned p;
221 unsigned tx_flags = ASYNC_TX_ACK;
222
223 if (sp2d->parity == 1)
224 tx_flags |= ASYNC_TX_XOR_ZERO_DST;
225
221 for (p = 0; p < sp2d->pages_in_unit; p++) { 226 for (p = 0; p < sp2d->pages_in_unit; p++) {
222 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 227 struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
223 228
224 if (!_1ps->write_count) 229 if (!_1ps->write_count)
225 continue; 230 continue;
226 231
227 init_async_submit(&_1ps->submit, 232 init_async_submit(&_1ps->submit, tx_flags,
228 ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, 233 NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
229 NULL, 234
230 NULL, NULL, 235 if (sp2d->parity == 1)
231 (addr_conv_t *)_1ps->scribble); 236 _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
232 237 _1ps->pages, 0, sp2d->data_devs,
233 /* TODO: raid6 */ 238 PAGE_SIZE, &_1ps->submit);
234 _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, 239 else /* parity == 2 */
235 0, sp2d->data_devs, PAGE_SIZE, 240 _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
236 &_1ps->submit); 241 sp2d->data_devs + sp2d->parity,
242 PAGE_SIZE, &_1ps->submit);
237 } 243 }
238 244
239 for (p = 0; p < sp2d->pages_in_unit; p++) { 245 for (p = 0; p < sp2d->pages_in_unit; p++) {
@@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
404 410
405 ore_calc_stripe_info(ios->layout, *offset, 0, &si); 411 ore_calc_stripe_info(ios->layout, *offset, 0, &si);
406 412
407 p = si.unit_off / PAGE_SIZE; 413 p = si.cur_pg;
408 c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, 414 c = si.cur_comp;
409 ios->layout->mirrors_p1, si.par_dev, si.dev);
410 page = ios->sp2d->_1p_stripes[p].pages[c]; 415 page = ios->sp2d->_1p_stripes[p].pages[c];
411 416
412 pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); 417 pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
@@ -534,9 +539,8 @@ static int _read_4_write_last_stripe(struct ore_io_state *ios)
534 goto read_it; 539 goto read_it;
535 540
536 ore_calc_stripe_info(ios->layout, offset, 0, &read_si); 541 ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
537 p = read_si.unit_off / PAGE_SIZE; 542 p = read_si.cur_pg;
538 c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, 543 c = read_si.cur_comp;
539 ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
540 544
541 if (min_p == sp2d->pages_in_unit) { 545 if (min_p == sp2d->pages_in_unit) {
542 /* Didn't do it yet */ 546 /* Didn't do it yet */
@@ -620,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios)
620int _ore_add_parity_unit(struct ore_io_state *ios, 624int _ore_add_parity_unit(struct ore_io_state *ios,
621 struct ore_striping_info *si, 625 struct ore_striping_info *si,
622 struct ore_per_dev_state *per_dev, 626 struct ore_per_dev_state *per_dev,
623 unsigned cur_len) 627 unsigned cur_len, bool do_xor)
624{ 628{
625 if (ios->reading) { 629 if (ios->reading) {
626 if (per_dev->cur_sg >= ios->sgs_per_dev) { 630 if (per_dev->cur_sg >= ios->sgs_per_dev) {
@@ -640,17 +644,16 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
640 si->cur_pg = _sp2d_min_pg(sp2d); 644 si->cur_pg = _sp2d_min_pg(sp2d);
641 num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; 645 num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
642 646
643 if (!cur_len) /* If last stripe operate on parity comp */
644 si->cur_comp = sp2d->data_devs;
645
646 if (!per_dev->length) { 647 if (!per_dev->length) {
647 per_dev->offset += si->cur_pg * PAGE_SIZE; 648 per_dev->offset += si->cur_pg * PAGE_SIZE;
648 /* If first stripe, Read in all read4write pages 649 /* If first stripe, Read in all read4write pages
649 * (if needed) before we calculate the first parity. 650 * (if needed) before we calculate the first parity.
650 */ 651 */
651 _read_4_write_first_stripe(ios); 652 if (do_xor)
653 _read_4_write_first_stripe(ios);
652 } 654 }
653 if (!cur_len) /* If last stripe r4w pages of last stripe */ 655 if (!cur_len && do_xor)
656 /* If last stripe r4w pages of last stripe */
654 _read_4_write_last_stripe(ios); 657 _read_4_write_last_stripe(ios);
655 _read_4_write_execute(ios); 658 _read_4_write_execute(ios);
656 659
@@ -662,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
662 ++(ios->cur_par_page); 665 ++(ios->cur_par_page);
663 } 666 }
664 667
665 BUG_ON(si->cur_comp != sp2d->data_devs); 668 BUG_ON(si->cur_comp < sp2d->data_devs);
666 BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); 669 BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
667 670
668 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, 671 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
@@ -670,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
670 if (unlikely(ret)) 673 if (unlikely(ret))
671 return ret; 674 return ret;
672 675
673 /* TODO: raid6 if (last_parity_dev) */ 676 if (do_xor) {
674 _gen_xor_unit(sp2d); 677 _gen_xor_unit(sp2d);
675 _sp2d_reset(sp2d, ios->r4w, ios->private); 678 _sp2d_reset(sp2d, ios->r4w, ios->private);
679 }
676 } 680 }
677 return 0; 681 return 0;
678} 682}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index 2ffd2c3c6e46..cf6375d82129 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -31,24 +31,6 @@
31#define ORE_DBGMSG2(M...) do {} while (0) 31#define ORE_DBGMSG2(M...) do {} while (0)
32/* #define ORE_DBGMSG2 ORE_DBGMSG */ 32/* #define ORE_DBGMSG2 ORE_DBGMSG */
33 33
34/* Calculate the component order in a stripe. eg the logical data unit
35 * address within the stripe of @dev given the @par_dev of this stripe.
36 */
37static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
38 unsigned par_dev, unsigned dev)
39{
40 unsigned first_dev = dev - dev % devs_in_group;
41
42 dev -= first_dev;
43 par_dev -= first_dev;
44
45 if (devs_in_group == par_dev) /* The raid 0 case */
46 return dev / mirrors_p1;
47 /* raid4/5/6 case */
48 return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
49 mirrors_p1;
50}
51
52/* ios_raid.c stuff needed by ios.c */ 34/* ios_raid.c stuff needed by ios.c */
53int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); 35int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
54void _ore_free_raid_stuff(struct ore_io_state *ios); 36void _ore_free_raid_stuff(struct ore_io_state *ios);
@@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios);
56void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, 38void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
57 bool not_last); 39 bool not_last);
58int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, 40int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
59 struct ore_per_dev_state *per_dev, unsigned cur_len); 41 struct ore_per_dev_state *per_dev, unsigned cur_len,
42 bool do_xor);
60void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, 43void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
61 struct ore_striping_info *si, struct page *page); 44 struct ore_striping_info *si, struct page *page);
62static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, 45static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,