author    Peng Tao <bergwolf@gmail.com>                   2012-08-23 12:27:51 -0400
committer Trond Myklebust <Trond.Myklebust@netapp.com>    2012-10-01 18:38:24 -0400
commit    fe6e1e8d9fad86873eb74a26e80a8f91f9e870b5 (patch)
tree      d09ce2464d48894ec12d39571d79e1176b493cfb /fs/nfs/blocklayout/blocklayout.c
parent    5d0e3a004f02bffab51f542fa1d5b2e2854d8545 (diff)
pnfsblock: fix partial page buffer write
If an application uses flock to protect its write range, generic NFS
will not do a read-modify-write cycle at the page cache level. The
layout driver therefore has to handle non-sector-aligned writes
itself; otherwise the result is data corruption.
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Peng Tao <tao.peng@emc.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
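
For context, a minimal userspace sketch (not part of the commit) of the write
pattern the message describes: a flock-protected write of 10 bytes at offset
100 is aligned to neither a page nor a 512-byte sector, so the block layout
driver must do its own read-modify-write. The mount point and offsets are
made up for illustration.

/* Illustrative only, not from the commit. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	const char buf[] = "0123456789";
	int fd = open("/mnt/pnfs/data.bin", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	flock(fd, LOCK_EX);	/* lock held: generic NFS skips page-cache RMW */
	if (pwrite(fd, buf, strlen(buf), 100) < 0)	/* non-aligned write */
		perror("pwrite");
	flock(fd, LOCK_UN);
	close(fd);
	return 0;
}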
Diffstat (limited to 'fs/nfs/blocklayout/blocklayout.c')

 fs/nfs/blocklayout/blocklayout.c | 177 +++++++++++++++++++++++++++++++++-----
 1 file changed, 165 insertions(+), 12 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3c61514599a5..a9fe644a12d1 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -162,25 +162,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
 				     sector_t isect, struct page *page,
 				     struct pnfs_block_extent *be,
 				     void (*end_io)(struct bio *, int err),
-				     struct parallel_io *par)
+				     struct parallel_io *par,
+				     unsigned int offset, int len)
 {
+	isect = isect + (offset >> SECTOR_SHIFT);
+	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+		npg, rw, (unsigned long long)isect, offset, len);
 retry:
 	if (!bio) {
 		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+	if (bio_add_page(bio, page, len, offset) < len) {
 		bio = bl_submit_bio(rw, bio);
 		goto retry;
 	}
 	return bio;
 }
 
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+				      sector_t isect, struct page *page,
+				      struct pnfs_block_extent *be,
+				      void (*end_io)(struct bio *, int err),
+				      struct parallel_io *par)
+{
+	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+				  end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -450,6 +464,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
 	return;
 }
 
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+
+	/* Only one page in bvec */
+	unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+		    unsigned int offset, unsigned int len)
+{
+	struct bio *bio;
+	struct page *shadow_page;
+	sector_t isect;
+	char *kaddr, *kshadow_addr;
+	int ret = 0;
+
+	dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (shadow_page == NULL)
+		return -ENOMEM;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+		(offset / SECTOR_SIZE);
+
+	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = bl_read_single_end_io;
+
+	lock_page(shadow_page);
+	if (bio_add_page(bio, shadow_page,
+			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+		unlock_page(shadow_page);
+		bio_put(bio);
+		return -EIO;
+	}
+
+	submit_bio(READ, bio);
+	wait_on_page_locked(shadow_page);
+	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+		ret = -EIO;
+	} else {
+		kaddr = kmap_atomic(page);
+		kshadow_addr = kmap_atomic(shadow_page);
+		memcpy(kaddr + offset, kshadow_addr + offset, len);
+		kunmap_atomic(kshadow_addr);
+		kunmap_atomic(kaddr);
+	}
+	__free_page(shadow_page);
+	bio_put(bio);
+
+	return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+			  unsigned int dirty_offset, unsigned int dirty_len,
+			  bool full_page)
+{
+	int ret = 0;
+	unsigned int start, end;
+
+	if (full_page) {
+		start = 0;
+		end = PAGE_CACHE_SIZE;
+	} else {
+		start = round_down(dirty_offset, SECTOR_SIZE);
+		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+	}
+
+	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+	if (!be) {
+		zero_user_segments(page, start, dirty_offset,
+				   dirty_offset + dirty_len, end);
+		if (start == 0 && end == PAGE_CACHE_SIZE &&
+		    trylock_page(page)) {
+			SetPageUptodate(page);
+			unlock_page(page);
+		}
+		return ret;
+	}
+
+	if (start != dirty_offset)
+		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+	if (!ret && (dirty_offset + dirty_len < end))
+		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+					  end - dirty_offset - dirty_len);
+
+	return ret;
+}
+
 /* Given an unmapped page, zero it or read in page for COW, page is locked
  * by caller.
  */
@@ -483,7 +597,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
 	SetPageUptodate(page);
 
 cleanup:
-	bl_put_extent(cow_read);
 	if (bh)
 		free_buffer_head(bh);
 	if (ret) {
@@ -555,6 +668,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	struct parallel_io *par;
 	loff_t offset = wdata->args.offset;
 	size_t count = wdata->args.count;
+	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = wdata->args.pages;
 	struct page *page;
 	pgoff_t index;
@@ -659,10 +773,11 @@ next_page:
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
+			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
 			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, NULL);
+					     isect, &cow_read);
 			if (!be || !is_writable(be, isect)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
@@ -679,7 +794,26 @@ next_page:
 			extent_length = be->be_length -
 			    (isect - be->be_f_offset);
 		}
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+		pg_offset = offset & ~PAGE_CACHE_MASK;
+		if (pg_offset + count > PAGE_CACHE_SIZE)
+			pg_len = PAGE_CACHE_SIZE - pg_offset;
+		else
+			pg_len = count;
+
+		saved_len = pg_len;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+		    !bl_is_sector_init(be->be_inval, isect)) {
+			ret = bl_read_partial_page_sync(pages[i], cow_read,
+							pg_offset, pg_len, true);
+			if (ret) {
+				dprintk("%s bl_read_partial_page_sync fail %d\n",
+					__func__, ret);
+				header->pnfs_error = ret;
+				goto out;
+			}
+
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 						       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
@@ -688,15 +822,35 @@ next_page:
 				header->pnfs_error = ret;
 				goto out;
 			}
+
+			/* Expand to full page write */
+			pg_offset = 0;
+			pg_len = PAGE_CACHE_SIZE;
+		} else if ((pg_offset & (SECTOR_SIZE - 1)) ||
+			    (pg_len & (SECTOR_SIZE - 1))){
+			/* ahh, nasty case. We have to do sync full sector
+			 * read-modify-write cycles.
+			 */
+			unsigned int saved_offset = pg_offset;
+			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+							pg_len, false);
+			pg_offset = round_down(pg_offset, SECTOR_SIZE);
+			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+				 - pg_offset;
 		}
-		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
 					 isect, pages[i], be,
-					 bl_end_io_write, par);
+					 bl_end_io_write, par,
+					 pg_offset, pg_len);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
+		offset += saved_len;
+		count -= saved_len;
 		isect += PAGE_CACHE_SECTORS;
 		last_isect = isect;
 		extent_length -= PAGE_CACHE_SECTORS;
@@ -714,17 +868,16 @@ next_page:
 	}
 
 write_done:
-	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
-	if (count < wdata->res.count) {
-		wdata->res.count = count;
-	}
+	wdata->res.count = wdata->args.count;
 out:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
 out_mds:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	kfree(par);
 	return PNFS_NOT_ATTEMPTED;
 }
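
To see what the new alignment logic in bl_write_pagelist() computes, here is a
small self-contained sketch (not kernel code) of the same sector rounding,
assuming 512-byte sectors as in the driver; the helper names are local to the
example and only mirror the kernel's round_down/round_up macros for
power-of-two sizes.

/* Standalone sketch: a partial write [pg_offset, pg_offset + pg_len)
 * is expanded to sector boundaries, as the patch does after
 * bl_read_partial_page_sync() fills in the unaligned edges. */
#include <stdio.h>

#define SECTOR_SIZE 512u

static unsigned int round_down_u(unsigned int x, unsigned int a)
{
	return x & ~(a - 1);
}

static unsigned int round_up_u(unsigned int x, unsigned int a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned int pg_offset = 100, pg_len = 10;	/* neither sector-aligned */

	if ((pg_offset & (SECTOR_SIZE - 1)) || (pg_len & (SECTOR_SIZE - 1))) {
		unsigned int saved_offset = pg_offset;

		pg_offset = round_down_u(pg_offset, SECTOR_SIZE);
		pg_len = round_up_u(saved_offset + pg_len, SECTOR_SIZE)
			 - pg_offset;
	}
	/* prints "offset 0 len 512": the bio now covers whole sectors */
	printf("offset %u len %u\n", pg_offset, pg_len);
	return 0;
}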