diff options
author | Boaz Harrosh <bharrosh@panasas.com> | 2011-12-28 12:21:45 -0500 |
---|---|---|
committer | Boaz Harrosh <bharrosh@panasas.com> | 2012-01-08 03:43:13 -0500 |
commit | 724577ca355795b0a25c93ccbeee927871ca1a77 (patch) | |
tree | ca3cdb4e7f1b9ab7963503227e1ca555c3535b3f | |
parent | 361aba569f55dd159b850489a3538253afbb3973 (diff) |
ore: Must support non-PAGE-aligned IO
NFS might send us offsets that are not PAGE aligned. So
we must read in the remainder of the first/last pages, in case
we need it for Parity calculations.
We only add an sg segment to read the partial page. But
we don't mark it as read=true because it is a lock-for-write
page.
TODO: In some cases (IO spans a single unit) we can just
adjust the raid_unit offset/length, but this is left for
later Kernels.
[Bug in 3.2.0 Kernel]
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
-rw-r--r-- | fs/exofs/ore_raid.c | 72 |
1 files changed, 60 insertions, 12 deletions
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 414a2dfd950..d222c77cfa1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c | |||
@@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios) | |||
328 | /* @si contains info of the to-be-inserted page. Update of @si should be | 328 | /* @si contains info of the to-be-inserted page. Update of @si should be |
329 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... | 329 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... |
330 | */ | 330 | */ |
331 | static int _add_to_read_4_write(struct ore_io_state *ios, | 331 | static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, |
332 | struct ore_striping_info *si, struct page *page) | 332 | struct page *page, unsigned pg_len) |
333 | { | 333 | { |
334 | struct request_queue *q; | 334 | struct request_queue *q; |
335 | struct ore_per_dev_state *per_dev; | 335 | struct ore_per_dev_state *per_dev; |
@@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios, | |||
366 | _ore_add_sg_seg(per_dev, gap, true); | 366 | _ore_add_sg_seg(per_dev, gap, true); |
367 | } | 367 | } |
368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | 368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); |
369 | added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); | 369 | added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, |
370 | if (unlikely(added_len != PAGE_SIZE)) { | 370 | si->obj_offset % PAGE_SIZE); |
371 | if (unlikely(added_len != pg_len)) { | ||
371 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", | 372 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", |
372 | per_dev->bio->bi_vcnt); | 373 | per_dev->bio->bi_vcnt); |
373 | return -ENOMEM; | 374 | return -ENOMEM; |
374 | } | 375 | } |
375 | 376 | ||
376 | per_dev->length += PAGE_SIZE; | 377 | per_dev->length += pg_len; |
377 | return 0; | 378 | return 0; |
378 | } | 379 | } |
379 | 380 | ||
381 | /* read the beginning of an unaligned first page */ | ||
382 | static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) | ||
383 | { | ||
384 | struct ore_striping_info si; | ||
385 | unsigned pg_len; | ||
386 | |||
387 | ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); | ||
388 | |||
389 | pg_len = si.obj_offset % PAGE_SIZE; | ||
390 | si.obj_offset -= pg_len; | ||
391 | |||
392 | ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", | ||
393 | _LLU(si.obj_offset), pg_len, page->index, si.dev); | ||
394 | |||
395 | return _add_to_r4w(ios, &si, page, pg_len); | ||
396 | } | ||
397 | |||
398 | /* read the end of an incomplete last page */ | ||
399 | static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) | ||
400 | { | ||
401 | struct ore_striping_info si; | ||
402 | struct page *page; | ||
403 | unsigned pg_len, p, c; | ||
404 | |||
405 | ore_calc_stripe_info(ios->layout, *offset, 0, &si); | ||
406 | |||
407 | p = si.unit_off / PAGE_SIZE; | ||
408 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | ||
409 | ios->layout->mirrors_p1, si.par_dev, si.dev); | ||
410 | page = ios->sp2d->_1p_stripes[p].pages[c]; | ||
411 | |||
412 | pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); | ||
413 | *offset += pg_len; | ||
414 | |||
415 | ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", | ||
416 | p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); | ||
417 | |||
418 | BUG_ON(!page); | ||
419 | |||
420 | return _add_to_r4w(ios, &si, page, pg_len); | ||
421 | } | ||
422 | |||
380 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | 423 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) |
381 | { | 424 | { |
382 | struct bio_vec *bv; | 425 | struct bio_vec *bv; |
@@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios) | |||
444 | struct page **pp = &_1ps->pages[c]; | 487 | struct page **pp = &_1ps->pages[c]; |
445 | bool uptodate; | 488 | bool uptodate; |
446 | 489 | ||
447 | if (*pp) | 490 | if (*pp) { |
491 | if (ios->offset % PAGE_SIZE) | ||
492 | /* Read the remainder of the page */ | ||
493 | _add_to_r4w_first_page(ios, *pp); | ||
448 | /* to-be-written pages start here */ | 494 | /* to-be-written pages start here */ |
449 | goto read_last_stripe; | 495 | goto read_last_stripe; |
496 | } | ||
450 | 497 | ||
451 | *pp = ios->r4w->get_page(ios->private, offset, | 498 | *pp = ios->r4w->get_page(ios->private, offset, |
452 | &uptodate); | 499 | &uptodate); |
@@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios) | |||
454 | return -ENOMEM; | 501 | return -ENOMEM; |
455 | 502 | ||
456 | if (!uptodate) | 503 | if (!uptodate) |
457 | _add_to_read_4_write(ios, &read_si, *pp); | 504 | _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); |
458 | 505 | ||
459 | /* Mark read-pages to be cache_released */ | 506 | /* Mark read-pages to be cache_released */ |
460 | _1ps->page_is_read[c] = true; | 507 | _1ps->page_is_read[c] = true; |
@@ -465,8 +512,11 @@ static int _read_4_write(struct ore_io_state *ios) | |||
465 | } | 512 | } |
466 | 513 | ||
467 | read_last_stripe: | 514 | read_last_stripe: |
468 | offset = ios->offset + (ios->length + PAGE_SIZE - 1) / | 515 | offset = ios->offset + ios->length; |
469 | PAGE_SIZE * PAGE_SIZE; | 516 | if (offset % PAGE_SIZE) |
517 | _add_to_r4w_last_page(ios, &offset); | ||
518 | /* offset will be aligned to next page */ | ||
519 | |||
470 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) | 520 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) |
471 | * bytes_in_stripe; | 521 | * bytes_in_stripe; |
472 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | 522 | if (offset == last_stripe_end) /* Optimize for the aligned case */ |
@@ -503,7 +553,7 @@ read_last_stripe: | |||
503 | /* Mark read-pages to be cache_released */ | 553 | /* Mark read-pages to be cache_released */ |
504 | _1ps->page_is_read[c] = true; | 554 | _1ps->page_is_read[c] = true; |
505 | if (!uptodate) | 555 | if (!uptodate) |
506 | _add_to_read_4_write(ios, &read_si, page); | 556 | _add_to_r4w(ios, &read_si, page, PAGE_SIZE); |
507 | } | 557 | } |
508 | 558 | ||
509 | offset += PAGE_SIZE; | 559 | offset += PAGE_SIZE; |
@@ -616,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | |||
616 | return -ENOMEM; | 666 | return -ENOMEM; |
617 | } | 667 | } |
618 | 668 | ||
619 | BUG_ON(ios->offset % PAGE_SIZE); | ||
620 | |||
621 | /* Round io down to last full strip */ | 669 | /* Round io down to last full strip */ |
622 | first_stripe = div_u64(ios->offset, stripe_size); | 670 | first_stripe = div_u64(ios->offset, stripe_size); |
623 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); | 671 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); |