diff options
| author | Boaz Harrosh <bharrosh@panasas.com> | 2011-12-28 12:21:45 -0500 |
|---|---|---|
| committer | Boaz Harrosh <bharrosh@panasas.com> | 2012-01-08 03:43:13 -0500 |
| commit | 724577ca355795b0a25c93ccbeee927871ca1a77 (patch) | |
| tree | ca3cdb4e7f1b9ab7963503227e1ca555c3535b3f /fs/exofs | |
| parent | 361aba569f55dd159b850489a3538253afbb3973 (diff) | |
ore: Must support non-PAGE-aligned IO
NFS might send us offsets that are not PAGE aligned. So
we must read in the remainder of the first/last pages, in cases
where we need it for Parity calculations.
We only add an sg segment to read the partial page. But
we don't mark it as read=true because it is a lock-for-write
page.
TODO: In some cases (IO spans a single unit) we can just
adjust the raid_unit offset/length, but this is left for
later Kernels.
[Bug in 3.2.0 Kernel]
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs')
| -rw-r--r-- | fs/exofs/ore_raid.c | 72 |
1 files changed, 60 insertions, 12 deletions
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 414a2dfd9500..d222c77cfa1b 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c | |||
| @@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios) | |||
| 328 | /* @si contains info of the to-be-inserted page. Update of @si should be | 328 | /* @si contains info of the to-be-inserted page. Update of @si should be |
| 329 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... | 329 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... |
| 330 | */ | 330 | */ |
| 331 | static int _add_to_read_4_write(struct ore_io_state *ios, | 331 | static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, |
| 332 | struct ore_striping_info *si, struct page *page) | 332 | struct page *page, unsigned pg_len) |
| 333 | { | 333 | { |
| 334 | struct request_queue *q; | 334 | struct request_queue *q; |
| 335 | struct ore_per_dev_state *per_dev; | 335 | struct ore_per_dev_state *per_dev; |
| @@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios, | |||
| 366 | _ore_add_sg_seg(per_dev, gap, true); | 366 | _ore_add_sg_seg(per_dev, gap, true); |
| 367 | } | 367 | } |
| 368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | 368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); |
| 369 | added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); | 369 | added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, |
| 370 | if (unlikely(added_len != PAGE_SIZE)) { | 370 | si->obj_offset % PAGE_SIZE); |
| 371 | if (unlikely(added_len != pg_len)) { | ||
| 371 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", | 372 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", |
| 372 | per_dev->bio->bi_vcnt); | 373 | per_dev->bio->bi_vcnt); |
| 373 | return -ENOMEM; | 374 | return -ENOMEM; |
| 374 | } | 375 | } |
| 375 | 376 | ||
| 376 | per_dev->length += PAGE_SIZE; | 377 | per_dev->length += pg_len; |
| 377 | return 0; | 378 | return 0; |
| 378 | } | 379 | } |
| 379 | 380 | ||
| 381 | /* read the beginning of an unaligned first page */ | ||
| 382 | static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) | ||
| 383 | { | ||
| 384 | struct ore_striping_info si; | ||
| 385 | unsigned pg_len; | ||
| 386 | |||
| 387 | ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); | ||
| 388 | |||
| 389 | pg_len = si.obj_offset % PAGE_SIZE; | ||
| 390 | si.obj_offset -= pg_len; | ||
| 391 | |||
| 392 | ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", | ||
| 393 | _LLU(si.obj_offset), pg_len, page->index, si.dev); | ||
| 394 | |||
| 395 | return _add_to_r4w(ios, &si, page, pg_len); | ||
| 396 | } | ||
| 397 | |||
| 398 | /* read the end of an incomplete last page */ | ||
| 399 | static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) | ||
| 400 | { | ||
| 401 | struct ore_striping_info si; | ||
| 402 | struct page *page; | ||
| 403 | unsigned pg_len, p, c; | ||
| 404 | |||
| 405 | ore_calc_stripe_info(ios->layout, *offset, 0, &si); | ||
| 406 | |||
| 407 | p = si.unit_off / PAGE_SIZE; | ||
| 408 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | ||
| 409 | ios->layout->mirrors_p1, si.par_dev, si.dev); | ||
| 410 | page = ios->sp2d->_1p_stripes[p].pages[c]; | ||
| 411 | |||
| 412 | pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); | ||
| 413 | *offset += pg_len; | ||
| 414 | |||
| 415 | ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", | ||
| 416 | p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); | ||
| 417 | |||
| 418 | BUG_ON(!page); | ||
| 419 | |||
| 420 | return _add_to_r4w(ios, &si, page, pg_len); | ||
| 421 | } | ||
| 422 | |||
| 380 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | 423 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) |
| 381 | { | 424 | { |
| 382 | struct bio_vec *bv; | 425 | struct bio_vec *bv; |
| @@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios) | |||
| 444 | struct page **pp = &_1ps->pages[c]; | 487 | struct page **pp = &_1ps->pages[c]; |
| 445 | bool uptodate; | 488 | bool uptodate; |
| 446 | 489 | ||
| 447 | if (*pp) | 490 | if (*pp) { |
| 491 | if (ios->offset % PAGE_SIZE) | ||
| 492 | /* Read the remainder of the page */ | ||
| 493 | _add_to_r4w_first_page(ios, *pp); | ||
| 448 | /* to-be-written pages start here */ | 494 | /* to-be-written pages start here */ |
| 449 | goto read_last_stripe; | 495 | goto read_last_stripe; |
| 496 | } | ||
| 450 | 497 | ||
| 451 | *pp = ios->r4w->get_page(ios->private, offset, | 498 | *pp = ios->r4w->get_page(ios->private, offset, |
| 452 | &uptodate); | 499 | &uptodate); |
| @@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios) | |||
| 454 | return -ENOMEM; | 501 | return -ENOMEM; |
| 455 | 502 | ||
| 456 | if (!uptodate) | 503 | if (!uptodate) |
| 457 | _add_to_read_4_write(ios, &read_si, *pp); | 504 | _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); |
| 458 | 505 | ||
| 459 | /* Mark read-pages to be cache_released */ | 506 | /* Mark read-pages to be cache_released */ |
| 460 | _1ps->page_is_read[c] = true; | 507 | _1ps->page_is_read[c] = true; |
| @@ -465,8 +512,11 @@ static int _read_4_write(struct ore_io_state *ios) | |||
| 465 | } | 512 | } |
| 466 | 513 | ||
| 467 | read_last_stripe: | 514 | read_last_stripe: |
| 468 | offset = ios->offset + (ios->length + PAGE_SIZE - 1) / | 515 | offset = ios->offset + ios->length; |
| 469 | PAGE_SIZE * PAGE_SIZE; | 516 | if (offset % PAGE_SIZE) |
| 517 | _add_to_r4w_last_page(ios, &offset); | ||
| 518 | /* offset will be aligned to next page */ | ||
| 519 | |||
| 470 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) | 520 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) |
| 471 | * bytes_in_stripe; | 521 | * bytes_in_stripe; |
| 472 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | 522 | if (offset == last_stripe_end) /* Optimize for the aligned case */ |
| @@ -503,7 +553,7 @@ read_last_stripe: | |||
| 503 | /* Mark read-pages to be cache_released */ | 553 | /* Mark read-pages to be cache_released */ |
| 504 | _1ps->page_is_read[c] = true; | 554 | _1ps->page_is_read[c] = true; |
| 505 | if (!uptodate) | 555 | if (!uptodate) |
| 506 | _add_to_read_4_write(ios, &read_si, page); | 556 | _add_to_r4w(ios, &read_si, page, PAGE_SIZE); |
| 507 | } | 557 | } |
| 508 | 558 | ||
| 509 | offset += PAGE_SIZE; | 559 | offset += PAGE_SIZE; |
| @@ -616,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | |||
| 616 | return -ENOMEM; | 666 | return -ENOMEM; |
| 617 | } | 667 | } |
| 618 | 668 | ||
| 619 | BUG_ON(ios->offset % PAGE_SIZE); | ||
| 620 | |||
| 621 | /* Round io down to last full strip */ | 669 | /* Round io down to last full strip */ |
| 622 | first_stripe = div_u64(ios->offset, stripe_size); | 670 | first_stripe = div_u64(ios->offset, stripe_size); |
| 623 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); | 671 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); |
