author		Boaz Harrosh <bharrosh@panasas.com>	2012-06-07 18:19:07 -0400
committer	Boaz Harrosh <bharrosh@panasas.com>	2012-07-20 04:45:28 -0400
commit		9ff19309a9623f2963ac5a136782ea4d8b5d67fb (patch)
tree		3d91d486964cbae927a3ac8795679961f4200f8c /fs
parent		6887a4131da3adaab011613776d865f4bcfb5678 (diff)
ore: Fix NFS crash by supporting any unaligned RAID IO
In RAID_5/6 we used to not permit an IO whose end byte is not stripe_size aligned and which spans more than one stripe. I.e. the caller must check whether the actual transferred byte count came back shorter after submission, and if so resubmit a new IO with the remainder.

Exofs supports this, and NFS was supposed to support it as well with its short-write mechanism. But late testing has exposed a CRASH when this is used with non-RPC layout-drivers.

The change at NFS is deep and risky; in its place, the fix at ORE to lift the limitation is actually clean and simple. So here it is below.

The principle here is that in the case of an IO unaligned on both ends, beginning and end, we send two read requests: one as in the old code, before the calculation of the first stripe, and a new one, before the calculation of the last stripe. If either "boundary" is aligned, or the complete IO fits within a single stripe, we do a single read as before.

The code stays clean and simple by splitting the old _read_4_write into 3 even parts:

1. _read_4_write_first_stripe
2. _read_4_write_last_stripe
3. _read_4_write_execute

1+3 are called at the same place as before, 2+3 before the last stripe, and when everything fits within a single stripe, 1+2+3 are performed additively.

Why did I not think of it before? Well, call it a stroke of genius: I have stared at this code for 2 years and did not find this simple solution until today. Not that I did not try.

This solution is much better for NFS than the previous one, because there the short write was dealt with out-of-band after IO_done, which caused a seeky IO pattern, whereas here we execute in order. In both solutions we do 2 separate reads; only here we do them within a single IO request. (And actually combine two writes into a single submission.)

NFS/exofs code need not change, since the ORE API communicates the new shorter length on return; that case simply does not occur anymore. Hurray!!

[Stable: this is an NFS bug since the 3.2 kernel; the patch should apply cleanly]
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
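
[Editor's note] For readers less used to RAID read-modify-write, here is a minimal user-space sketch -- not the kernel code; all constants and variable names are invented for illustration -- of the decision the patched ORE now makes. An IO with an unaligned head, an unaligned tail, or both gets up to two pre-reads, which the patch queues on one ios->ios_read_4_write and submits via a single _read_4_write_execute() call:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int main(void)
{
	const uint64_t stripe_size = 3 * 64 * 1024; /* e.g. group_width=3, stripe_unit=64K */
	const uint64_t offset = 10 * 1024;          /* write start, unaligned */
	const uint64_t length = 400 * 1024;         /* spans multiple stripes */
	const uint64_t end = offset + length;

	/* like ios->si.first_stripe_start: offset rounded down to a stripe */
	uint64_t first_stripe_start = offset / stripe_size * stripe_size;
	/* like last_stripe_end: end rounded up to a stripe boundary */
	uint64_t last_stripe_end = (end + stripe_size - 1) / stripe_size * stripe_size;

	bool read_head = first_stripe_start != offset; /* _read_4_write_first_stripe */
	bool read_tail = last_stripe_end != end;       /* _read_4_write_last_stripe */

	printf("head pre-read [%llu..%llu): %s\n",
	       (unsigned long long)first_stripe_start,
	       (unsigned long long)offset, read_head ? "yes" : "no");
	printf("tail pre-read [%llu..%llu): %s\n",
	       (unsigned long long)end,
	       (unsigned long long)last_stripe_end, read_tail ? "yes" : "no");
	return 0;
}

With these numbers the sketch reports a head pre-read of [0..10240) and a tail pre-read of [419840..589824). The sketch treats head and tail independently; in the real code the two ranges are merged into one read when they fall within the same stripe, which is the "single read like before" case of the commit message.
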
Diffstat (limited to 'fs')
-rw-r--r--	fs/exofs/ore_raid.c	67
1 file changed, 36 insertions, 31 deletions
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index d222c77cfa1b..fff2070c6751 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -461,16 +461,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
  * ios->sp2d[p][*], xor is calculated the same way. These pages are
  * allocated/freed and don't go through cache
  */
-static int _read_4_write(struct ore_io_state *ios)
+static int _read_4_write_first_stripe(struct ore_io_state *ios)
 {
-	struct ore_io_state *ios_read;
 	struct ore_striping_info read_si;
 	struct __stripe_pages_2d *sp2d = ios->sp2d;
 	u64 offset = ios->si.first_stripe_start;
-	u64 last_stripe_end;
-	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
-	unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-	int ret;
+	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
 
 	if (offset == ios->offset) /* Go to start collect $200 */
 		goto read_last_stripe;
@@ -478,6 +474,9 @@ static int _read_4_write(struct ore_io_state *ios)
 	min_p = _sp2d_min_pg(sp2d);
 	max_p = _sp2d_max_pg(sp2d);
 
+	ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
+		   offset, ios->offset, min_p, max_p);
+
 	for (c = 0; ; c++) {
 		ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
 		read_si.obj_offset += min_p * PAGE_SIZE;
@@ -512,6 +511,18 @@ static int _read_4_write(struct ore_io_state *ios)
 	}
 
 read_last_stripe:
+	return 0;
+}
+
+static int _read_4_write_last_stripe(struct ore_io_state *ios)
+{
+	struct ore_striping_info read_si;
+	struct __stripe_pages_2d *sp2d = ios->sp2d;
+	u64 offset;
+	u64 last_stripe_end;
+	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
 	offset = ios->offset + ios->length;
 	if (offset % PAGE_SIZE)
 		_add_to_r4w_last_page(ios, &offset);
@@ -527,15 +538,15 @@ read_last_stripe:
 	c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
 		       ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
 
-	BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
-	/* unaligned IO must be within a single stripe */
-
 	if (min_p == sp2d->pages_in_unit) {
 		/* Didn't do it yet */
 		min_p = _sp2d_min_pg(sp2d);
 		max_p = _sp2d_max_pg(sp2d);
 	}
 
+	ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
+		   offset, last_stripe_end, min_p, max_p);
+
 	while (offset < last_stripe_end) {
 		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
 
@@ -568,6 +579,15 @@ read_last_stripe:
 	}
 
 read_it:
+	return 0;
+}
+
+static int _read_4_write_execute(struct ore_io_state *ios)
+{
+	struct ore_io_state *ios_read;
+	unsigned i;
+	int ret;
+
 	ios_read = ios->ios_read_4_write;
 	if (!ios_read)
 		return 0;
@@ -591,6 +611,8 @@ read_it:
 	}
 
 	_mark_read4write_pages_uptodate(ios_read, ret);
+	ore_put_io_state(ios_read);
+	ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
 	return 0;
 }
 
@@ -626,8 +648,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
 		/* If first stripe, Read in all read4write pages
 		 * (if needed) before we calculate the first parity.
 		 */
-		_read_4_write(ios);
+		_read_4_write_first_stripe(ios);
 	}
+	if (!cur_len) /* If last stripe r4w pages of last stripe */
+		_read_4_write_last_stripe(ios);
+	_read_4_write_execute(ios);
 
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = _raid_page_alloc();
@@ -654,34 +679,14 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
 
 int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
 {
-	struct ore_layout *layout = ios->layout;
-
 	if (ios->parity_pages) {
+		struct ore_layout *layout = ios->layout;
 		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
-		unsigned stripe_size = ios->si.bytes_in_stripe;
-		u64 last_stripe, first_stripe;
 
 		if (_sp2d_alloc(pages_in_unit, layout->group_width,
 				layout->parity, &ios->sp2d)) {
 			return -ENOMEM;
 		}
-
-		/* Round io down to last full strip */
-		first_stripe = div_u64(ios->offset, stripe_size);
-		last_stripe = div_u64(ios->offset + ios->length, stripe_size);
-
-		/* If an IO spans more then a single stripe it must end at
-		 * a stripe boundary. The reminder at the end is pushed into the
-		 * next IO.
-		 */
-		if (last_stripe != first_stripe) {
-			ios->length = last_stripe * stripe_size - ios->offset;
-
-			BUG_ON(!ios->length);
-			ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
-				PAGE_SIZE;
-			ios->si.length = ios->length; /*make it consistent */
-		}
 	}
 	return 0;
 }
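
[Editor's note] As a footnote on the hunk removed from _ore_post_alloc_raid_stuff() above, here is a stand-alone reconstruction -- with invented example values, and the kernel's div_u64 helper replaced by plain division -- of the truncation arithmetic that used to force callers into the short-write resubmit path:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t stripe_size = 192 * 1024; /* stands in for ios->si.bytes_in_stripe */
	uint64_t offset = 10 * 1024;
	uint64_t length = 400 * 1024;

	uint64_t first_stripe = offset / stripe_size;
	uint64_t last_stripe = (offset + length) / stripe_size;

	if (last_stripe != first_stripe) {
		/* Old behaviour: round the IO down to the last full stripe
		 * boundary and leave the remainder to a resubmitted IO. */
		uint64_t short_len = last_stripe * stripe_size - offset;

		printf("accepted %llu of %llu bytes, %llu left to resubmit\n",
		       (unsigned long long)short_len,
		       (unsigned long long)length,
		       (unsigned long long)(length - short_len));
	}
	return 0;
}

With these numbers the old code accepted only 382976 of the 409600 requested bytes and left 26624 bytes for a follow-up IO; after this patch the full IO goes through in one submission.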