author		Christoph Hellwig <hch@lst.de>				2014-09-10 11:23:32 -0400
committer	Trond Myklebust <trond.myklebust@primarydata.com>	2014-09-10 15:47:03 -0400
commit		3a6fd1f004fcaf3dd1c28a7cd16406c8318eb64a
tree		192f1a9bac9e03c9fd3f78f0ac24ba40b59d688a /fs/nfs/blocklayout/blocklayout.c
parent		c88953d87f5c8cd95bebcbd6d15f2f0cdd348136
pnfs/blocklayout: remove read-modify-write handling in bl_write_pagelist
Use the new PNFS_READ_WHOLE_PAGE flag to offload read-modify-write
handling to core nfs code, and remove a huge chunk of deadlock prone
mess from the block layout writeback path.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
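
The mechanism in brief: a layout driver that sets PNFS_READ_WHOLE_PAGE in its
pnfs_layoutdriver_type flags (see the final hunk below) asks the generic NFS
write path to read a partially-written page in full before the write proceeds,
so bl_write_pagelist() only ever sees whole, page-aligned pages. A minimal
sketch of the core-side check, assuming the helper shape introduced by the
companion patches (the helper itself is not part of this diff):

	/* fs/nfs/pnfs.h -- assumed shape of the companion helper */
	static inline bool pnfs_ld_read_whole_page(struct inode *inode)
	{
		if (!pnfs_enabled_sb(NFS_SERVER(inode)))
			return false;
		return NFS_SERVER(inode)->pnfs_curr_ld->flags &
			PNFS_READ_WHOLE_PAGE;
	}

With that in place, the generic nfs_want_read_modify_write() path can trigger
the read for any partial write to a page that is not yet up to date, and the
in-driver read-modify-write machinery removed below becomes unnecessary.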
Diffstat (limited to 'fs/nfs/blocklayout/blocklayout.c')
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	498
1 file changed, 63 insertions(+), 435 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 87a633d03507..cf87254b6cd1 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/bio.h>		/* struct bio */
-#include <linux/buffer_head.h>	/* various write calls */
 #include <linux/prefetch.h>
 #include <linux/pagevec.h>
 
@@ -188,16 +187,6 @@ retry:
 	return bio;
 }
 
-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
-				      sector_t isect, struct page *page,
-				      struct pnfs_block_extent *be,
-				      void (*end_io)(struct bio *, int err),
-				      struct parallel_io *par)
-{
-	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
-				  end_io, par, 0, PAGE_CACHE_SIZE);
-}
-
 /* This is basically copied from mpage_end_io_read */
 static void bl_end_io_read(struct bio *bio, int err)
 {
@@ -293,8 +282,8 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			}
 		}
 
+		pg_offset = f_offset & ~PAGE_CACHE_MASK;
 		if (is_dio) {
-			pg_offset = f_offset & ~PAGE_CACHE_MASK;
 			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
 				pg_len = PAGE_CACHE_SIZE - pg_offset;
 			else
@@ -305,7 +294,7 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
 			isect += (pg_offset >> SECTOR_SHIFT);
 			extent_length -= (pg_offset >> SECTOR_SHIFT);
 		} else {
-			pg_offset = 0;
+			BUG_ON(pg_offset != 0);
 			pg_len = PAGE_CACHE_SIZE;
 		}
 
@@ -383,29 +372,6 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 	}
 }
 
-static void bl_end_io_write_zero(struct bio *bio, int err)
-{
-	struct parallel_io *par = bio->bi_private;
-	struct bio_vec *bvec;
-	int i;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		/* This is the zeroing page we added */
-		end_page_writeback(bvec->bv_page);
-		page_cache_release(bvec->bv_page);
-	}
-
-	if (unlikely(err)) {
-		struct nfs_pgio_header *header = par->data;
-
-		if (!header->pnfs_error)
-			header->pnfs_error = -EIO;
-		pnfs_set_lo_fail(header->lseg);
-	}
-	bio_put(bio);
-	put_parallel(par);
-}
-
 static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
@@ -455,256 +421,22 @@ static void bl_end_par_io_write(void *data, int num_se)
 	schedule_work(&hdr->task.u.tk_work);
 }
 
-/* FIXME STUB - mark intersection of layout and page as bad, so is not
- * used again.
- */
-static void mark_bad_read(void)
-{
-	return;
-}
-
-/*
- * map_block: map a requested I/0 block (isect) into an offset in the LVM
- * block_device
- */
-static void
-map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
-{
-	dprintk("%s enter be=%p\n", __func__, be);
-
-	set_buffer_mapped(bh);
-	bh->b_bdev = be->be_mdev;
-	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
-	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
-
-	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
-		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
-		bh->b_size);
-	return;
-}
-
-static void
-bl_read_single_end_io(struct bio *bio, int error)
-{
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct page *page = bvec->bv_page;
-
-	/* Only one page in bvec */
-	unlock_page(page);
-}
-
-static int
-bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
-		    unsigned int offset, unsigned int len)
-{
-	struct bio *bio;
-	struct page *shadow_page;
-	sector_t isect;
-	char *kaddr, *kshadow_addr;
-	int ret = 0;
-
-	dprintk("%s: offset %u len %u\n", __func__, offset, len);
-
-	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (shadow_page == NULL)
-		return -ENOMEM;
-
-	bio = bio_alloc(GFP_NOIO, 1);
-	if (bio == NULL)
-		return -ENOMEM;
-
-	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
-		(offset / SECTOR_SIZE);
-
-	bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
-	bio->bi_bdev = be->be_mdev;
-	bio->bi_end_io = bl_read_single_end_io;
-
-	lock_page(shadow_page);
-	if (bio_add_page(bio, shadow_page,
-			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
-		unlock_page(shadow_page);
-		bio_put(bio);
-		return -EIO;
-	}
-
-	submit_bio(READ, bio);
-	wait_on_page_locked(shadow_page);
-	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
-		ret = -EIO;
-	} else {
-		kaddr = kmap_atomic(page);
-		kshadow_addr = kmap_atomic(shadow_page);
-		memcpy(kaddr + offset, kshadow_addr + offset, len);
-		kunmap_atomic(kshadow_addr);
-		kunmap_atomic(kaddr);
-	}
-	__free_page(shadow_page);
-	bio_put(bio);
-
-	return ret;
-}
-
-static int
-bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
-			  unsigned int dirty_offset, unsigned int dirty_len,
-			  bool full_page)
-{
-	int ret = 0;
-	unsigned int start, end;
-
-	if (full_page) {
-		start = 0;
-		end = PAGE_CACHE_SIZE;
-	} else {
-		start = round_down(dirty_offset, SECTOR_SIZE);
-		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
-	}
-
-	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
-	if (!be) {
-		zero_user_segments(page, start, dirty_offset,
-				   dirty_offset + dirty_len, end);
-		if (start == 0 && end == PAGE_CACHE_SIZE &&
-		    trylock_page(page)) {
-			SetPageUptodate(page);
-			unlock_page(page);
-		}
-		return ret;
-	}
-
-	if (start != dirty_offset)
-		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
-
-	if (!ret && (dirty_offset + dirty_len < end))
-		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
-					  end - dirty_offset - dirty_len);
-
-	return ret;
-}
-
-/* Given an unmapped page, zero it or read in page for COW, page is locked
- * by caller.
- */
-static int
-init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
-{
-	struct buffer_head *bh = NULL;
-	int ret = 0;
-	sector_t isect;
-
-	dprintk("%s enter, %p\n", __func__, page);
-	BUG_ON(PageUptodate(page));
-	if (!cow_read) {
-		zero_user_segment(page, 0, PAGE_SIZE);
-		SetPageUptodate(page);
-		goto cleanup;
-	}
-
-	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
-	if (!bh) {
-		ret = -ENOMEM;
-		goto cleanup;
-	}
-
-	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
-	map_block(bh, isect, cow_read);
-	if (!bh_uptodate_or_lock(bh))
-		ret = bh_submit_read(bh);
-	if (ret)
-		goto cleanup;
-	SetPageUptodate(page);
-
-cleanup:
-	if (bh)
-		free_buffer_head(bh);
-	if (ret) {
-		/* Need to mark layout with bad read...should now
-		 * just use nfs4 for reads and writes.
-		 */
-		mark_bad_read();
-	}
-	return ret;
-}
-
-/* Find or create a zeroing page marked being writeback.
- * Return ERR_PTR on error, NULL to indicate skip this page and page itself
- * to indicate write out.
- */
-static struct page *
-bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
-			 struct pnfs_block_extent *cow_read)
-{
-	struct page *page;
-	int locked = 0;
-	page = find_get_page(inode->i_mapping, index);
-	if (page)
-		goto check_page;
-
-	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
-	if (unlikely(!page)) {
-		dprintk("%s oom\n", __func__);
-		return ERR_PTR(-ENOMEM);
-	}
-	locked = 1;
-
-check_page:
-	/* PageDirty: Other will write this out
-	 * PageWriteback: Other is writing this out
-	 * PageUptodate: It was read before
-	 */
-	if (PageDirty(page) || PageWriteback(page)) {
-		print_page(page);
-		if (locked)
-			unlock_page(page);
-		page_cache_release(page);
-		return NULL;
-	}
-
-	if (!locked) {
-		lock_page(page);
-		locked = 1;
-		goto check_page;
-	}
-	if (!PageUptodate(page)) {
-		/* New page, readin or zero it */
-		init_page_for_write(page, cow_read);
-	}
-	set_page_writeback(page);
-	unlock_page(page);
-
-	return page;
-}
-
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 {
-	int i, ret, npg_zero, pg_index, last = 0;
+	int i, ret;
 	struct bio *bio = NULL;
-	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
-	sector_t isect, last_isect = 0, extent_length = 0;
+	struct pnfs_block_extent *be = NULL;
+	sector_t isect, extent_length = 0;
 	struct parallel_io *par = NULL;
 	loff_t offset = header->args.offset;
 	size_t count = header->args.count;
-	unsigned int pg_offset, pg_len, saved_len;
 	struct page **pages = header->args.pages;
-	struct page *page;
-	pgoff_t index;
-	u64 temp;
-	int npg_per_block =
-		NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	struct blk_plug plug;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 
-	blk_start_plug(&plug);
-
-	if (header->dreq != NULL &&
-	    (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
-	     !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
-		dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
-		goto out_mds;
-	}
 	/* At this point, header->page_aray is a (sequential) list of nfs_pages.
 	 * We want to write each, and if there is an error set pnfs_error
 	 * to have it redone using nfs.
@@ -715,97 +447,20 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	par->pnfs_callback = bl_end_par_io_write;
 	/* At this point, have to be more careful with error handling */
 
-	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-	be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
-	if (!be || !is_writable(be, isect)) {
-		dprintk("%s no matching extents!\n", __func__);
-		goto out_mds;
-	}
+	blk_start_plug(&plug);
 
-	/* First page inside INVALID extent */
-	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		if (likely(!bl_push_one_short_extent(be->be_inval)))
-			par->bse_count++;
-		else
-			goto out_mds;
-		temp = offset >> PAGE_CACHE_SHIFT;
-		npg_zero = do_div(temp, npg_per_block);
-		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
-				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-		extent_length = be->be_length - (isect - be->be_f_offset);
-
-fill_invalid_ext:
-		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
-		for (;npg_zero > 0; npg_zero--) {
-			if (bl_is_sector_init(be->be_inval, isect)) {
-				dprintk("isect %llu already init\n",
-					(unsigned long long)isect);
-				goto next_page;
-			}
-			/* page ref released in bl_end_io_write_zero */
-			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
-			dprintk("%s zero %dth page: index %lu isect %llu\n",
-				__func__, npg_zero, index,
-				(unsigned long long)isect);
-			page = bl_find_get_zeroing_page(header->inode, index,
-							cow_read);
-			if (unlikely(IS_ERR(page))) {
-				header->pnfs_error = PTR_ERR(page);
-				goto out;
-			} else if (page == NULL)
-				goto next_page;
+	/* we always write out the whole page */
+	offset = offset & (loff_t)PAGE_CACHE_MASK;
+	isect = offset >> SECTOR_SHIFT;
 
-			ret = bl_mark_sectors_init(be->be_inval, isect,
-						   PAGE_CACHE_SECTORS);
-			if (unlikely(ret)) {
-				dprintk("%s bl_mark_sectors_init fail %d\n",
-					__func__, ret);
-				end_page_writeback(page);
-				page_cache_release(page);
-				header->pnfs_error = ret;
-				goto out;
-			}
-			if (likely(!bl_push_one_short_extent(be->be_inval)))
-				par->bse_count++;
-			else {
-				end_page_writeback(page);
-				page_cache_release(page);
-				header->pnfs_error = -ENOMEM;
-				goto out;
-			}
-			/* FIXME: This should be done in bi_end_io */
-			mark_extents_written(BLK_LSEG2EXT(header->lseg),
-					     page->index << PAGE_CACHE_SHIFT,
-					     PAGE_CACHE_SIZE);
-
-			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
-						 isect, page, be,
-						 bl_end_io_write_zero, par);
-			if (IS_ERR(bio)) {
-				header->pnfs_error = PTR_ERR(bio);
-				bio = NULL;
-				goto out;
-			}
-next_page:
-			isect += PAGE_CACHE_SECTORS;
-			extent_length -= PAGE_CACHE_SECTORS;
-		}
-		if (last)
-			goto write_done;
-	}
-	bio = bl_submit_bio(WRITE, bio);
-
-	/* Middle pages */
-	pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	for (i = pg_index; i < header->page_array.npages; i++) {
 		if (extent_length <= 0) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
-			bl_put_extent(cow_read);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
 			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
-					isect, &cow_read);
+					isect, NULL);
 			if (!be || !is_writable(be, isect)) {
 				header->pnfs_error = -EINVAL;
 				goto out;
@@ -823,25 +478,10 @@ next_page:
 					(isect - be->be_f_offset);
 		}
 
-		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
-		pg_offset = offset & ~PAGE_CACHE_MASK;
-		if (pg_offset + count > PAGE_CACHE_SIZE)
-			pg_len = PAGE_CACHE_SIZE - pg_offset;
-		else
-			pg_len = count;
+		BUG_ON(offset & ~PAGE_CACHE_MASK);
 
-		saved_len = pg_len;
 		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
 		    !bl_is_sector_init(be->be_inval, isect)) {
-			ret = bl_read_partial_page_sync(pages[i], cow_read,
-							pg_offset, pg_len, true);
-			if (ret) {
-				dprintk("%s bl_read_partial_page_sync fail %d\n",
-					__func__, ret);
-				header->pnfs_error = ret;
-				goto out;
-			}
-
 			ret = bl_mark_sectors_init(be->be_inval, isect,
 						   PAGE_CACHE_SECTORS);
 			if (unlikely(ret)) {
@@ -850,66 +490,31 @@ next_page:
 				header->pnfs_error = ret;
 				goto out;
 			}
-
-			/* Expand to full page write */
-			pg_offset = 0;
-			pg_len = PAGE_CACHE_SIZE;
-		} else if ((pg_offset & (SECTOR_SIZE - 1)) ||
-			    (pg_len & (SECTOR_SIZE - 1))){
-			/* ahh, nasty case. We have to do sync full sector
-			 * read-modify-write cycles.
-			 */
-			unsigned int saved_offset = pg_offset;
-			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
-							pg_len, false);
-			pg_offset = round_down(pg_offset, SECTOR_SIZE);
-			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
-				 - pg_offset;
 		}
 
-
 		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
-					 WRITE,
-					 isect, pages[i], be,
+					 WRITE, isect, pages[i], be,
 					 bl_end_io_write, par,
-					 pg_offset, pg_len);
+					 0, PAGE_CACHE_SIZE);
 		if (IS_ERR(bio)) {
 			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
-		offset += saved_len;
-		count -= saved_len;
+		offset += PAGE_CACHE_SIZE;
+		count -= PAGE_CACHE_SIZE;
 		isect += PAGE_CACHE_SECTORS;
-		last_isect = isect;
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
 
-	/* Last page inside INVALID extent */
-	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-		bio = bl_submit_bio(WRITE, bio);
-		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
-		npg_zero = npg_per_block - do_div(temp, npg_per_block);
-		if (npg_zero < npg_per_block) {
-			last = 1;
-			goto fill_invalid_ext;
-		}
-	}
-
-write_done:
 	header->res.count = header->args.count;
 out:
 	bl_put_extent(be);
-	bl_put_extent(cow_read);
 	bl_submit_bio(WRITE, bio);
 	blk_finish_plug(&plug);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
 out_mds:
-	blk_finish_plug(&plug);
-	bl_put_extent(be);
-	bl_put_extent(cow_read);
-	kfree(par);
 	return PNFS_NOT_ATTEMPTED;
 }
 
@@ -1188,20 +793,45 @@ bl_clear_layoutdriver(struct nfs_server *server)
 }
 
 static bool
-is_aligned_req(struct nfs_page *req, unsigned int alignment)
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+		struct nfs_page *req, unsigned int alignment)
 {
-	return IS_ALIGNED(req->wb_offset, alignment) &&
-	       IS_ALIGNED(req->wb_bytes, alignment);
+	/*
+	 * Always accept buffered writes, higher layers take care of the
+	 * right alignment.
+	 */
+	if (pgio->pg_dreq == NULL)
+		return true;
+
+	if (!IS_ALIGNED(req->wb_offset, alignment))
+		return false;
+
+	if (IS_ALIGNED(req->wb_bytes, alignment))
+		return true;
+
+	if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+		/*
+		 * If the write goes up to the inode size, just write
+		 * the full page.  Data past the inode size is
+		 * guaranteed to be zeroed by the higher level client
+		 * code, and this behaviour is mandated by RFC 5663
+		 * section 2.3.2.
+		 */
+		return true;
+	}
+
+	return false;
 }
 
 static void
 bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, SECTOR_SIZE))
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
 		nfs_pageio_reset_read_mds(pgio);
-	else
-		pnfs_generic_pg_init_read(pgio, req);
+		return;
+	}
+
+	pnfs_generic_pg_init_read(pgio, req);
 }
 
 /*
@@ -1212,10 +842,8 @@ static size_t
 bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, SECTOR_SIZE))
+	if (!is_aligned_req(pgio, req, SECTOR_SIZE))
 		return 0;
-
 	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
@@ -1245,19 +873,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 static void
 bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, PAGE_CACHE_SIZE)) {
+	u64 wb_size;
+
+	if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
 		nfs_pageio_reset_write_mds(pgio);
-	} else {
-		u64 wb_size;
-		if (pgio->pg_dreq == NULL)
-			wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
-						      req->wb_index);
-		else
-			wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
-		pnfs_generic_pg_init_write(pgio, req, wb_size);
+		return;
 	}
+
+	if (pgio->pg_dreq == NULL)
+		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+					      req->wb_index);
+	else
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
 }
 
 /*
@@ -1268,10 +897,8 @@ static size_t
 bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		 struct nfs_page *req)
 {
-	if (pgio->pg_dreq != NULL &&
-	    !is_aligned_req(req, PAGE_CACHE_SIZE))
+	if (!is_aligned_req(pgio, req, PAGE_SIZE))
 		return 0;
-
 	return pnfs_generic_pg_test(pgio, prev, req);
 }
 
@@ -1291,6 +918,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
 	.id				= LAYOUT_BLOCK_VOLUME,
 	.name				= "LAYOUT_BLOCK_VOLUME",
 	.owner				= THIS_MODULE,
+	.flags				= PNFS_READ_WHOLE_PAGE,
 	.read_pagelist			= bl_read_pagelist,
 	.write_pagelist			= bl_write_pagelist,
 	.alloc_layout_hdr		= bl_alloc_layout_hdr,
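
As a concrete illustration of the new direct-I/O acceptance rule in
is_aligned_req() above, here is a small stand-alone model with the kernel
types replaced by plain integers; the end-of-file special case mirrors the
RFC 5663 section 2.3.2 behaviour cited in the comment (toy code, not part of
the patch):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Model of is_aligned_req() for direct I/O (pg_dreq != NULL):
	 * offset/bytes describe the request, i_size is the file size. */
	static bool model_is_aligned_req(uint64_t offset, uint64_t bytes,
					 uint64_t alignment, uint64_t i_size)
	{
		if (offset % alignment)
			return false;		/* misaligned start */
		if (bytes % alignment == 0)
			return true;		/* aligned length */
		/* A ragged end is acceptable only when it lands exactly on
		 * EOF; the client then writes out the whole page. */
		return offset + bytes == i_size;
	}

	int main(void)
	{
		/* aligned 4k write: stays on the block layout path */
		printf("%d\n", model_is_aligned_req(4096, 4096, 4096, 1 << 20));
		/* ragged tail mid-file: falls back to the MDS */
		printf("%d\n", model_is_aligned_req(0, 700, 4096, 1 << 20));
		/* ragged tail ending at EOF: accepted as a whole page */
		printf("%d\n", model_is_aligned_req(0, 700, 4096, 700));
		return 0;
	}

Buffered writes (pg_dreq == NULL) skip these checks entirely, since with this
patch the page cache is responsible for presenting whole pages to the driver.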