path: root/fs/nfs/blocklayout/blocklayout.c
author		Peng Tao <bergwolf@gmail.com>	2012-08-23 12:27:51 -0400
committer	Trond Myklebust <Trond.Myklebust@netapp.com>	2012-10-01 18:38:24 -0400
commit		fe6e1e8d9fad86873eb74a26e80a8f91f9e870b5
tree		d09ce2464d48894ec12d39571d79e1176b493cfb	/fs/nfs/blocklayout/blocklayout.c
parent		5d0e3a004f02bffab51f542fa1d5b2e2854d8545
pnfsblock: fix partial page buffer write
If applications use flock to protect their write range, generic NFS
will not do a read-modify-write cycle at the page cache level.
Therefore the layout driver (LD) must know how to handle
non-sector-aligned writes, otherwise there will be data corruption.

Cc: stable <stable@vger.kernel.org>
Signed-off-by: Peng Tao <tao.peng@emc.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
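For context, the flock scenario described above can be demonstrated from
userspace with a sketch like the one below. The mount point, offset, and
sizes are illustrative assumptions, not taken from the patch:

/*
 * Illustrative only: the application serializes its writers with
 * flock(2), so generic NFS skips the page-cache read-modify-write,
 * and then writes a range aligned to neither page nor 512-byte
 * sector boundaries. The block layout driver must then perform the
 * sector-level RMW itself -- which is what this patch adds.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	char buf[100];
	int fd = open("/mnt/pnfs/data", O_RDWR);	/* hypothetical pNFS mount */

	if (fd < 0)
		return 1;
	memset(buf, 'x', sizeof(buf));
	flock(fd, LOCK_EX);
	/* 100 bytes at offset 700: neither end is sector-aligned */
	pwrite(fd, buf, sizeof(buf), 700);
	flock(fd, LOCK_UN);
	close(fd);
	return 0;
}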
Diffstat (limited to 'fs/nfs/blocklayout/blocklayout.c')
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	177
1 file changed, 165 insertions(+), 12 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3c61514599a5..a9fe644a12d1 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -162,25 +162,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
 			      sector_t isect, struct page *page,
 			      struct pnfs_block_extent *be,
 			      void (*end_io)(struct bio *, int err),
-			      struct parallel_io *par)
+			      struct parallel_io *par,
+			      unsigned int offset, int len)
 {
+	isect = isect + (offset >> SECTOR_SHIFT);
+	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+		npg, rw, (unsigned long long)isect, offset, len);
 retry:
 	if (!bio) {
 		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
 		if (!bio)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+	if (bio_add_page(bio, page, len, offset) < len) {
 		bio = bl_submit_bio(rw, bio);
 		goto retry;
 	}
 	return bio;
 }
 
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+			      sector_t isect, struct page *page,
+			      struct pnfs_block_extent *be,
+			      void (*end_io)(struct bio *, int err),
+			      struct parallel_io *par)
+{
+	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
+				  end_io, par, 0, PAGE_CACHE_SIZE);
+}
+
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -450,6 +464,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
 	return;
 }
 
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+
+	/* Only one page in bvec */
+	unlock_page(page);
+}
+
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+		    unsigned int offset, unsigned int len)
+{
+	struct bio *bio;
+	struct page *shadow_page;
+	sector_t isect;
+	char *kaddr, *kshadow_addr;
+	int ret = 0;
+
+	dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (shadow_page == NULL)
+		return -ENOMEM;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+		(offset / SECTOR_SIZE);
+
+	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = bl_read_single_end_io;
+
+	lock_page(shadow_page);
+	if (bio_add_page(bio, shadow_page,
+			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+		unlock_page(shadow_page);
+		bio_put(bio);
+		return -EIO;
+	}
+
+	submit_bio(READ, bio);
+	wait_on_page_locked(shadow_page);
+	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+		ret = -EIO;
+	} else {
+		kaddr = kmap_atomic(page);
+		kshadow_addr = kmap_atomic(shadow_page);
+		memcpy(kaddr + offset, kshadow_addr + offset, len);
+		kunmap_atomic(kshadow_addr);
+		kunmap_atomic(kaddr);
+	}
+	__free_page(shadow_page);
+	bio_put(bio);
+
+	return ret;
+}
+
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+			  unsigned int dirty_offset, unsigned int dirty_len,
+			  bool full_page)
+{
+	int ret = 0;
+	unsigned int start, end;
+
+	if (full_page) {
+		start = 0;
+		end = PAGE_CACHE_SIZE;
+	} else {
+		start = round_down(dirty_offset, SECTOR_SIZE);
+		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+	}
+
+	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+	if (!be) {
+		zero_user_segments(page, start, dirty_offset,
+				   dirty_offset + dirty_len, end);
+		if (start == 0 && end == PAGE_CACHE_SIZE &&
+		    trylock_page(page)) {
+			SetPageUptodate(page);
+			unlock_page(page);
+		}
+		return ret;
+	}
+
+	if (start != dirty_offset)
+		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+	if (!ret && (dirty_offset + dirty_len < end))
+		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+					  end - dirty_offset - dirty_len);
+
+	return ret;
+}
+
 /* Given an unmapped page, zero it or read in page for COW, page is locked
  * by caller.
  */
@@ -483,7 +597,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
 	SetPageUptodate(page);
 
 cleanup:
-	bl_put_extent(cow_read);
 	if (bh)
 		free_buffer_head(bh);
 	if (ret) {
@@ -555,6 +668,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	struct parallel_io *par;
 	loff_t offset = wdata->args.offset;
 	size_t count = wdata->args.count;
+	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = wdata->args.pages;
 	struct page *page;
 	pgoff_t index;
@@ -659,10 +773,11 @@ next_page:
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
+			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
 			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					     isect, NULL);
+					     isect, &cow_read);
 			if (!be || !is_writable(be, isect)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
@@ -679,7 +794,26 @@ next_page:
 			extent_length = be->be_length -
 				(isect - be->be_f_offset);
 		}
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+
+		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
+		pg_offset = offset & ~PAGE_CACHE_MASK;
+		if (pg_offset + count > PAGE_CACHE_SIZE)
+			pg_len = PAGE_CACHE_SIZE - pg_offset;
+		else
+			pg_len = count;
+
+		saved_len = pg_len;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
+		    !bl_is_sector_init(be->be_inval, isect)) {
+			ret = bl_read_partial_page_sync(pages[i], cow_read,
+							pg_offset, pg_len, true);
+			if (ret) {
+				dprintk("%s bl_read_partial_page_sync fail %d\n",
+					__func__, ret);
+				header->pnfs_error = ret;
+				goto out;
+			}
+
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 					       PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
@@ -688,15 +822,35 @@ next_page:
 				header->pnfs_error = ret;
 				goto out;
 			}
+
+			/* Expand to full page write */
+			pg_offset = 0;
+			pg_len = PAGE_CACHE_SIZE;
+		} else if ((pg_offset & (SECTOR_SIZE - 1)) ||
+			   (pg_len & (SECTOR_SIZE - 1))) {
+			/* ahh, nasty case. We have to do sync full sector
+			 * read-modify-write cycles.
+			 */
+			unsigned int saved_offset = pg_offset;
+			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
+							pg_len, false);
+			pg_offset = round_down(pg_offset, SECTOR_SIZE);
+			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
+				 - pg_offset;
 		}
-		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
+
+
+		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
 					 isect, pages[i], be,
-					 bl_end_io_write, par);
+					 bl_end_io_write, par,
+					 pg_offset, pg_len);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
+		offset += saved_len;
+		count -= saved_len;
 		isect += PAGE_CACHE_SECTORS;
 		last_isect = isect;
 		extent_length -= PAGE_CACHE_SECTORS;
@@ -714,17 +868,16 @@ next_page:
 	}
 
 write_done:
-	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
-	if (count < wdata->res.count) {
-		wdata->res.count = count;
-	}
+	wdata->res.count = wdata->args.count;
 out:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
out_mds:
 	bl_put_extent(be);
+	bl_put_extent(cow_read);
 	kfree(par);
 	return PNFS_NOT_ATTEMPTED;
 }
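To make the alignment arithmetic in the write-path hunks concrete, here is
a small standalone sketch of the round_down()/round_up() logic the patch
applies, with SECTOR_SIZE assumed to be 512. The macros below mimic the
kernel helpers for power-of-two alignment; they are not the patch's code:

#include <stdio.h>

#define SECTOR_SIZE	512u

/* Power-of-two alignment, same results as the kernel's helpers */
#define round_down(x, y)	((x) & ~((y) - 1))
#define round_up(x, y)		((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	/* e.g. a 100-byte write at page offset 700 */
	unsigned int pg_offset = 700, pg_len = 100;
	unsigned int start = round_down(pg_offset, SECTOR_SIZE);
	unsigned int end = round_up(pg_offset + pg_len, SECTOR_SIZE);

	/* start = 512, end = 1024: the not-yet-dirty edges of the
	 * covered sectors (bytes 512..699 and 800..1023) are read in
	 * first, then the whole aligned range [512, 1024) is written.
	 */
	printf("aligned range: [%u, %u)\n", start, end);
	return 0;
}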