diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-05-08 20:47:32 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-07-10 20:31:46 -0400 |
commit | 3a307ffc2730bfa1a4dfa94537be9d412338aad2 (patch) | |
tree | 3e83201eb816aec8b897afcf3920dd716ce4412c | |
parent | 2e89b2e48e1da09ed483f195968c9172aa95b5e2 (diff) |
ocfs2: rework ocfs2_buffered_write_cluster()
Use some ideas from the new-aops patch series and turn
ocfs2_buffered_write_cluster() into a 2 stage operation with the caller
copying data in between. The code now understands multiple-cluster writes as
a result of having to deal with a full page write for pages larger than 4k.
This sets us up to easily call into the write path during ->page_mkwrite().
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r-- | fs/ocfs2/aops.c | 812 | ||||
-rw-r--r-- | fs/ocfs2/aops.h | 56 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 121 |
3 files changed, 551 insertions, 438 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a480b09c79b9..3e5758ebd932 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
684 | bh = bh->b_this_page, block_start += bsize) { | 684 | bh = bh->b_this_page, block_start += bsize) { |
685 | block_end = block_start + bsize; | 685 | block_end = block_start + bsize; |
686 | 686 | ||
687 | clear_buffer_new(bh); | ||
688 | |||
687 | /* | 689 | /* |
688 | * Ignore blocks outside of our i/o range - | 690 | * Ignore blocks outside of our i/o range - |
689 | * they may belong to unallocated clusters. | 691 | * they may belong to unallocated clusters. |
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
698 | * For an allocating write with cluster size >= page | 700 | * For an allocating write with cluster size >= page |
699 | * size, we always write the entire page. | 701 | * size, we always write the entire page. |
700 | */ | 702 | */ |
701 | 703 | if (new) | |
702 | if (buffer_new(bh)) | 704 | set_buffer_new(bh); |
703 | clear_buffer_new(bh); | ||
704 | 705 | ||
705 | if (!buffer_mapped(bh)) { | 706 | if (!buffer_mapped(bh)) { |
706 | map_bh(bh, inode->i_sb, *p_blkno); | 707 | map_bh(bh, inode->i_sb, *p_blkno); |
@@ -761,217 +762,232 @@ next_bh: | |||
761 | return ret; | 762 | return ret; |
762 | } | 763 | } |
763 | 764 | ||
765 | #if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) | ||
766 | #define OCFS2_MAX_CTXT_PAGES 1 | ||
767 | #else | ||
768 | #define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) | ||
769 | #endif | ||
770 | |||
771 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) | ||
772 | |||
764 | /* | 773 | /* |
765 | * This will copy user data from the buffer page in the splice | 774 | * Describe the state of a single cluster to be written to. |
766 | * context. | ||
767 | * | ||
768 | * For now, we ignore SPLICE_F_MOVE as that would require some extra | ||
769 | * communication out all the way to ocfs2_write(). | ||
770 | */ | 775 | */ |
771 | int ocfs2_map_and_write_splice_data(struct inode *inode, | 776 | struct ocfs2_write_cluster_desc { |
772 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | 777 | u32 c_cpos; |
773 | unsigned int *ret_from, unsigned int *ret_to) | 778 | u32 c_phys; |
774 | { | 779 | /* |
775 | int ret; | 780 | * Give this a unique field because c_phys eventually gets |
776 | unsigned int to, from, cluster_start, cluster_end; | 781 | * filled. |
777 | char *src, *dst; | 782 | */ |
778 | struct ocfs2_splice_write_priv *sp = wc->w_private; | 783 | unsigned c_new; |
779 | struct pipe_buffer *buf = sp->s_buf; | 784 | }; |
780 | unsigned long bytes, src_from; | ||
781 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
782 | 785 | ||
783 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 786 | struct ocfs2_write_ctxt { |
784 | &cluster_end); | 787 | /* Logical cluster position / len of write */ |
788 | u32 w_cpos; | ||
789 | u32 w_clen; | ||
785 | 790 | ||
786 | from = sp->s_offset; | 791 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; |
787 | src_from = sp->s_buf_offset; | ||
788 | bytes = wc->w_count; | ||
789 | 792 | ||
790 | if (wc->w_large_pages) { | 793 | /* |
791 | /* | 794 | * This is true if page_size > cluster_size. |
792 | * For cluster size < page size, we have to | 795 | * |
793 | * calculate pos within the cluster and obey | 796 | * It triggers a set of special cases during write which might |
794 | * the rightmost boundary. | 797 | * have to deal with allocating writes to partial pages. |
795 | */ | 798 | */ |
796 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | 799 | unsigned int w_large_pages; |
797 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
798 | } | ||
799 | to = from + bytes; | ||
800 | 800 | ||
801 | BUG_ON(from > PAGE_CACHE_SIZE); | 801 | /* |
802 | BUG_ON(to > PAGE_CACHE_SIZE); | 802 | * Pages involved in this write. |
803 | BUG_ON(from < cluster_start); | 803 | * |
804 | BUG_ON(to > cluster_end); | 804 | * w_target_page is the page being written to by the user. |
805 | * | ||
806 | * w_pages is an array of pages which always contains | ||
807 | * w_target_page, and in the case of an allocating write with | ||
808 | * page_size < cluster size, it will contain zero'd and mapped | ||
809 | * pages adjacent to w_target_page which need to be written | ||
810 | * out so that future reads from that region will get | ||
811 | * zeros. | ||
812 | */ | ||
813 | struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; | ||
814 | unsigned int w_num_pages; | ||
815 | struct page *w_target_page; | ||
805 | 816 | ||
806 | if (wc->w_this_page_new) | 817 | /* |
807 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 818 | * ocfs2_write_end() uses this to know what the real range to |
808 | cluster_start, cluster_end, 1); | 819 | * write in the target should be. |
809 | else | 820 | */ |
810 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 821 | unsigned int w_target_from; |
811 | from, to, 0); | 822 | unsigned int w_target_to; |
812 | if (ret) { | 823 | |
813 | mlog_errno(ret); | 824 | /* |
814 | goto out; | 825 | * We could use journal_current_handle() but this is cleaner, |
826 | * IMHO -Mark | ||
827 | */ | ||
828 | handle_t *w_handle; | ||
829 | |||
830 | struct buffer_head *w_di_bh; | ||
831 | }; | ||
832 | |||
833 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | ||
834 | { | ||
835 | int i; | ||
836 | |||
837 | for(i = 0; i < wc->w_num_pages; i++) { | ||
838 | if (wc->w_pages[i] == NULL) | ||
839 | continue; | ||
840 | |||
841 | unlock_page(wc->w_pages[i]); | ||
842 | mark_page_accessed(wc->w_pages[i]); | ||
843 | page_cache_release(wc->w_pages[i]); | ||
815 | } | 844 | } |
816 | 845 | ||
817 | src = buf->ops->map(sp->s_pipe, buf, 1); | 846 | brelse(wc->w_di_bh); |
818 | dst = kmap_atomic(wc->w_this_page, KM_USER1); | 847 | kfree(wc); |
819 | memcpy(dst + from, src + src_from, bytes); | 848 | } |
820 | kunmap_atomic(wc->w_this_page, KM_USER1); | 849 | |
821 | buf->ops->unmap(sp->s_pipe, buf, src); | 850 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, |
851 | struct ocfs2_super *osb, loff_t pos, | ||
852 | unsigned len) | ||
853 | { | ||
854 | struct ocfs2_write_ctxt *wc; | ||
855 | |||
856 | wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); | ||
857 | if (!wc) | ||
858 | return -ENOMEM; | ||
822 | 859 | ||
823 | wc->w_finished_copy = 1; | 860 | wc->w_cpos = pos >> osb->s_clustersize_bits; |
861 | wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); | ||
824 | 862 | ||
825 | *ret_from = from; | 863 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) |
826 | *ret_to = to; | 864 | wc->w_large_pages = 1; |
827 | out: | 865 | else |
866 | wc->w_large_pages = 0; | ||
867 | |||
868 | *wcp = wc; | ||
828 | 869 | ||
829 | return bytes ? (unsigned int)bytes : ret; | 870 | return 0; |
830 | } | 871 | } |
831 | 872 | ||
832 | /* | 873 | /* |
833 | * This will copy user data from the iovec in the buffered write | 874 | * If a page has any new buffers, zero them out here, and mark them uptodate |
834 | * context. | 875 | * and dirty so they'll be written out (in order to prevent uninitialised |
876 | * block data from leaking). And clear the new bit. | ||
835 | */ | 877 | */ |
836 | int ocfs2_map_and_write_user_data(struct inode *inode, | 878 | static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) |
837 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
838 | unsigned int *ret_from, unsigned int *ret_to) | ||
839 | { | 879 | { |
840 | int ret; | 880 | unsigned int block_start, block_end; |
841 | unsigned int to, from, cluster_start, cluster_end; | 881 | struct buffer_head *head, *bh; |
842 | unsigned long bytes, src_from; | ||
843 | char *dst; | ||
844 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
845 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
846 | char __user *buf; | ||
847 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
848 | 882 | ||
849 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 883 | BUG_ON(!PageLocked(page)); |
850 | &cluster_end); | 884 | if (!page_has_buffers(page)) |
885 | return; | ||
851 | 886 | ||
852 | buf = cur_iov->iov_base + bp->b_cur_off; | 887 | bh = head = page_buffers(page); |
853 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | 888 | block_start = 0; |
889 | do { | ||
890 | block_end = block_start + bh->b_size; | ||
891 | |||
892 | if (buffer_new(bh)) { | ||
893 | if (block_end > from && block_start < to) { | ||
894 | if (!PageUptodate(page)) { | ||
895 | unsigned start, end; | ||
896 | void *kaddr; | ||
897 | |||
898 | start = max(from, block_start); | ||
899 | end = min(to, block_end); | ||
900 | |||
901 | kaddr = kmap_atomic(page, KM_USER0); | ||
902 | memset(kaddr+start, 0, end - start); | ||
903 | flush_dcache_page(page); | ||
904 | kunmap_atomic(kaddr, KM_USER0); | ||
905 | set_buffer_uptodate(bh); | ||
906 | } | ||
907 | |||
908 | clear_buffer_new(bh); | ||
909 | mark_buffer_dirty(bh); | ||
910 | } | ||
911 | } | ||
854 | 912 | ||
855 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | 913 | block_start = block_end; |
914 | bh = bh->b_this_page; | ||
915 | } while (bh != head); | ||
916 | } | ||
917 | |||
918 | /* | ||
919 | * Only called when we have a failure during an allocating write, to write | ||
920 | * zeros to the newly allocated region. | ||
921 | */ | ||
922 | static void ocfs2_write_failure(struct inode *inode, | ||
923 | struct ocfs2_write_ctxt *wc, | ||
924 | loff_t user_pos, unsigned user_len) | ||
925 | { | ||
926 | int i; | ||
927 | unsigned from, to; | ||
928 | struct page *tmppage; | ||
929 | |||
930 | ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); | ||
856 | 931 | ||
857 | /* | ||
858 | * This is a lot of comparisons, but it reads quite | ||
859 | * easily, which is important here. | ||
860 | */ | ||
861 | /* Stay within the src page */ | ||
862 | bytes = PAGE_SIZE - src_from; | ||
863 | /* Stay within the vector */ | ||
864 | bytes = min(bytes, | ||
865 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
866 | /* Stay within count */ | ||
867 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
868 | /* | ||
869 | * For clustersize > page size, just stay within | ||
870 | * target page, otherwise we have to calculate pos | ||
871 | * within the cluster and obey the rightmost | ||
872 | * boundary. | ||
873 | */ | ||
874 | if (wc->w_large_pages) { | 932 | if (wc->w_large_pages) { |
875 | /* | 933 | from = wc->w_target_from; |
876 | * For cluster size < page size, we have to | 934 | to = wc->w_target_to; |
877 | * calculate pos within the cluster and obey | ||
878 | * the rightmost boundary. | ||
879 | */ | ||
880 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
881 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
882 | } else { | 935 | } else { |
883 | /* | 936 | from = 0; |
884 | * cluster size > page size is the most common | 937 | to = PAGE_CACHE_SIZE; |
885 | * case - we just stay within the target page | ||
886 | * boundary. | ||
887 | */ | ||
888 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
889 | } | 938 | } |
890 | 939 | ||
891 | to = from + bytes; | 940 | for(i = 0; i < wc->w_num_pages; i++) { |
941 | tmppage = wc->w_pages[i]; | ||
892 | 942 | ||
893 | BUG_ON(from > PAGE_CACHE_SIZE); | 943 | if (ocfs2_should_order_data(inode)) |
894 | BUG_ON(to > PAGE_CACHE_SIZE); | 944 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), |
895 | BUG_ON(from < cluster_start); | 945 | from, to, NULL, |
896 | BUG_ON(to > cluster_end); | 946 | ocfs2_journal_dirty_data); |
897 | 947 | ||
898 | if (wc->w_this_page_new) | 948 | block_commit_write(tmppage, from, to); |
899 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
900 | cluster_start, cluster_end, 1); | ||
901 | else | ||
902 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
903 | from, to, 0); | ||
904 | if (ret) { | ||
905 | mlog_errno(ret); | ||
906 | goto out; | ||
907 | } | 949 | } |
908 | |||
909 | dst = kmap(wc->w_this_page); | ||
910 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | ||
911 | kunmap(wc->w_this_page); | ||
912 | |||
913 | /* | ||
914 | * XXX: This is slow, but simple. The caller of | ||
915 | * ocfs2_buffered_write_cluster() is responsible for | ||
916 | * passing through the iovecs, so it's difficult to | ||
917 | * predict what our next step is in here after our | ||
918 | * initial write. A future version should be pushing | ||
919 | * that iovec manipulation further down. | ||
920 | * | ||
921 | * By setting this, we indicate that a copy from user | ||
922 | * data was done, and subsequent calls for this | ||
923 | * cluster will skip copying more data. | ||
924 | */ | ||
925 | wc->w_finished_copy = 1; | ||
926 | |||
927 | *ret_from = from; | ||
928 | *ret_to = to; | ||
929 | out: | ||
930 | |||
931 | return bytes ? (unsigned int)bytes : ret; | ||
932 | } | 950 | } |
933 | 951 | ||
934 | /* | 952 | static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, |
935 | * Map, fill and write a page to disk. | 953 | struct ocfs2_write_ctxt *wc, |
936 | * | 954 | struct page *page, u32 cpos, |
937 | * The work of copying data is done via callback. Newly allocated | 955 | loff_t user_pos, unsigned user_len, |
938 | * pages which don't take user data will be zero'd (set 'new' to | 956 | int new) |
939 | * indicate an allocating write) | ||
940 | * | ||
941 | * Returns a negative error code or the number of bytes copied into | ||
942 | * the page. | ||
943 | */ | ||
944 | static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
945 | u64 *p_blkno, struct page *page, | ||
946 | struct ocfs2_write_ctxt *wc, int new) | ||
947 | { | 957 | { |
948 | int ret, copied = 0; | 958 | int ret; |
949 | unsigned int from = 0, to = 0; | 959 | unsigned int map_from = 0, map_to = 0; |
950 | unsigned int cluster_start, cluster_end; | 960 | unsigned int cluster_start, cluster_end; |
951 | unsigned int zero_from = 0, zero_to = 0; | 961 | unsigned int user_data_from = 0, user_data_to = 0; |
952 | 962 | ||
953 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | 963 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, |
954 | &cluster_start, &cluster_end); | 964 | &cluster_start, &cluster_end); |
955 | 965 | ||
956 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | 966 | if (page == wc->w_target_page) { |
957 | && !wc->w_finished_copy) { | 967 | map_from = user_pos & (PAGE_CACHE_SIZE - 1); |
958 | 968 | map_to = map_from + user_len; | |
959 | wc->w_this_page = page; | 969 | |
960 | wc->w_this_page_new = new; | 970 | if (new) |
961 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | 971 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
962 | if (ret < 0) { | 972 | cluster_start, cluster_end, |
973 | new); | ||
974 | else | ||
975 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
976 | map_from, map_to, new); | ||
977 | if (ret) { | ||
963 | mlog_errno(ret); | 978 | mlog_errno(ret); |
964 | goto out; | 979 | goto out; |
965 | } | 980 | } |
966 | 981 | ||
967 | copied = ret; | 982 | user_data_from = map_from; |
968 | 983 | user_data_to = map_to; | |
969 | zero_from = from; | ||
970 | zero_to = to; | ||
971 | if (new) { | 984 | if (new) { |
972 | from = cluster_start; | 985 | map_from = cluster_start; |
973 | to = cluster_end; | 986 | map_to = cluster_end; |
974 | } | 987 | } |
988 | |||
989 | wc->w_target_from = map_from; | ||
990 | wc->w_target_to = map_to; | ||
975 | } else { | 991 | } else { |
976 | /* | 992 | /* |
977 | * If we haven't allocated the new page yet, we | 993 | * If we haven't allocated the new page yet, we |
@@ -980,11 +996,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
980 | */ | 996 | */ |
981 | BUG_ON(!new); | 997 | BUG_ON(!new); |
982 | 998 | ||
983 | from = cluster_start; | 999 | map_from = cluster_start; |
984 | to = cluster_end; | 1000 | map_to = cluster_end; |
985 | 1001 | ||
986 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | 1002 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
987 | cluster_start, cluster_end, 1); | 1003 | cluster_start, cluster_end, new); |
988 | if (ret) { | 1004 | if (ret) { |
989 | mlog_errno(ret); | 1005 | mlog_errno(ret); |
990 | goto out; | 1006 | goto out; |
@@ -1003,108 +1019,84 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
1003 | */ | 1019 | */ |
1004 | if (new && !PageUptodate(page)) | 1020 | if (new && !PageUptodate(page)) |
1005 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | 1021 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), |
1006 | wc->w_cpos, zero_from, zero_to); | 1022 | cpos, user_data_from, user_data_to); |
1007 | 1023 | ||
1008 | flush_dcache_page(page); | 1024 | flush_dcache_page(page); |
1009 | 1025 | ||
1010 | if (ocfs2_should_order_data(inode)) { | ||
1011 | ret = walk_page_buffers(handle, | ||
1012 | page_buffers(page), | ||
1013 | from, to, NULL, | ||
1014 | ocfs2_journal_dirty_data); | ||
1015 | if (ret < 0) | ||
1016 | mlog_errno(ret); | ||
1017 | } | ||
1018 | |||
1019 | /* | ||
1020 | * We don't use generic_commit_write() because we need to | ||
1021 | * handle our own i_size update. | ||
1022 | */ | ||
1023 | ret = block_commit_write(page, from, to); | ||
1024 | if (ret) | ||
1025 | mlog_errno(ret); | ||
1026 | out: | 1026 | out: |
1027 | 1027 | return ret; | |
1028 | return copied ? copied : ret; | ||
1029 | } | 1028 | } |
1030 | 1029 | ||
1031 | /* | 1030 | /* |
1032 | * Do the actual write of some data into an inode. Optionally allocate | 1031 | * This function will only grab one cluster's worth of pages. |
1033 | * in order to fulfill the write. | ||
1034 | * | ||
1035 | * cpos is the logical cluster offset within the file to write at | ||
1036 | * | ||
1037 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
1038 | * zero indicates that allocation is required. In this case, data_ac | ||
1039 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
1040 | * allocation isn't required). | ||
1041 | */ | 1032 | */ |
1042 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | 1033 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, |
1043 | struct buffer_head *di_bh, | 1034 | struct ocfs2_write_ctxt *wc, |
1044 | struct ocfs2_alloc_context *data_ac, | 1035 | u32 cpos, loff_t user_pos, int new) |
1045 | struct ocfs2_alloc_context *meta_ac, | ||
1046 | struct ocfs2_write_ctxt *wc) | ||
1047 | { | 1036 | { |
1048 | int ret, i, numpages = 1, new; | 1037 | int ret = 0, i; |
1049 | unsigned int copied = 0; | 1038 | unsigned long start, target_index, index; |
1050 | u32 tmp_pos; | ||
1051 | u64 v_blkno, p_blkno; | ||
1052 | struct address_space *mapping = file->f_mapping; | ||
1053 | struct inode *inode = mapping->host; | 1039 | struct inode *inode = mapping->host; |
1054 | unsigned long index, start; | ||
1055 | struct page **cpages; | ||
1056 | 1040 | ||
1057 | new = phys == 0 ? 1 : 0; | 1041 | target_index = user_pos >> PAGE_CACHE_SHIFT; |
1058 | 1042 | ||
1059 | /* | 1043 | /* |
1060 | * Figure out how many pages we'll be manipulating here. For | 1044 | * Figure out how many pages we'll be manipulating here. For |
1061 | * non allocating write, we just change the one | 1045 | * non allocating write, we just change the one |
1062 | * page. Otherwise, we'll need a whole clusters worth. | 1046 | * page. Otherwise, we'll need a whole clusters worth. |
1063 | */ | 1047 | */ |
1064 | if (new) | ||
1065 | numpages = ocfs2_pages_per_cluster(inode->i_sb); | ||
1066 | |||
1067 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
1068 | if (!cpages) { | ||
1069 | ret = -ENOMEM; | ||
1070 | mlog_errno(ret); | ||
1071 | return ret; | ||
1072 | } | ||
1073 | |||
1074 | /* | ||
1075 | * Fill our page array first. That way we've grabbed enough so | ||
1076 | * that we can zero and flush if we error after adding the | ||
1077 | * extent. | ||
1078 | */ | ||
1079 | if (new) { | 1048 | if (new) { |
1080 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | 1049 | wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); |
1081 | wc->w_cpos); | 1050 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); |
1082 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
1083 | } else { | 1051 | } else { |
1084 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | 1052 | wc->w_num_pages = 1; |
1085 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | 1053 | start = target_index; |
1086 | } | 1054 | } |
1087 | 1055 | ||
1088 | for(i = 0; i < numpages; i++) { | 1056 | for(i = 0; i < wc->w_num_pages; i++) { |
1089 | index = start + i; | 1057 | index = start + i; |
1090 | 1058 | ||
1091 | cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); | 1059 | wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); |
1092 | if (!cpages[i]) { | 1060 | if (!wc->w_pages[i]) { |
1093 | ret = -ENOMEM; | 1061 | ret = -ENOMEM; |
1094 | mlog_errno(ret); | 1062 | mlog_errno(ret); |
1095 | goto out; | 1063 | goto out; |
1096 | } | 1064 | } |
1065 | |||
1066 | if (index == target_index) | ||
1067 | wc->w_target_page = wc->w_pages[i]; | ||
1097 | } | 1068 | } |
1069 | out: | ||
1070 | return ret; | ||
1071 | } | ||
1072 | |||
1073 | /* | ||
1074 | * Prepare a single cluster of the file for write. | ||
1075 | */ | ||
1076 | static int ocfs2_write_cluster(struct address_space *mapping, | ||
1077 | u32 phys, struct ocfs2_alloc_context *data_ac, | ||
1078 | struct ocfs2_alloc_context *meta_ac, | ||
1079 | struct ocfs2_write_ctxt *wc, u32 cpos, | ||
1080 | loff_t user_pos, unsigned user_len) | ||
1081 | { | ||
1082 | int ret, i, new; | ||
1083 | u64 v_blkno, p_blkno; | ||
1084 | struct inode *inode = mapping->host; | ||
1085 | |||
1086 | new = phys == 0 ? 1 : 0; | ||
1098 | 1087 | ||
1099 | if (new) { | 1088 | if (new) { |
1089 | u32 tmp_pos; | ||
1090 | |||
1100 | /* | 1091 | /* |
1101 | * This is safe to call with the page locks - it won't take | 1092 | * This is safe to call with the page locks - it won't take |
1102 | * any additional semaphores or cluster locks. | 1093 | * any additional semaphores or cluster locks. |
1103 | */ | 1094 | */ |
1104 | tmp_pos = wc->w_cpos; | 1095 | tmp_pos = cpos; |
1105 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | 1096 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, |
1106 | &tmp_pos, 1, di_bh, handle, | 1097 | &tmp_pos, 1, wc->w_di_bh, |
1107 | data_ac, meta_ac, NULL); | 1098 | wc->w_handle, data_ac, |
1099 | meta_ac, NULL); | ||
1108 | /* | 1100 | /* |
1109 | * This shouldn't happen because we must have already | 1101 | * This shouldn't happen because we must have already |
1110 | * calculated the correct meta data allocation required. The | 1102 | * calculated the correct meta data allocation required. The |
@@ -1121,103 +1113,132 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | |||
1121 | mlog_errno(ret); | 1113 | mlog_errno(ret); |
1122 | goto out; | 1114 | goto out; |
1123 | } | 1115 | } |
1116 | |||
1117 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | ||
1118 | } else { | ||
1119 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | ||
1124 | } | 1120 | } |
1125 | 1121 | ||
1122 | /* | ||
1123 | * The only reason this should fail is due to an inability to | ||
1124 | * find the extent added. | ||
1125 | */ | ||
1126 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1126 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, |
1127 | NULL); | 1127 | NULL); |
1128 | if (ret < 0) { | 1128 | if (ret < 0) { |
1129 | 1129 | ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " | |
1130 | /* | 1130 | "at logical block %llu", |
1131 | * XXX: Should we go readonly here? | 1131 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1132 | */ | 1132 | (unsigned long long)v_blkno); |
1133 | |||
1134 | mlog_errno(ret); | ||
1135 | goto out; | 1133 | goto out; |
1136 | } | 1134 | } |
1137 | 1135 | ||
1138 | BUG_ON(p_blkno == 0); | 1136 | BUG_ON(p_blkno == 0); |
1139 | 1137 | ||
1140 | for(i = 0; i < numpages; i++) { | 1138 | for(i = 0; i < wc->w_num_pages; i++) { |
1141 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | 1139 | int tmpret; |
1142 | wc, new); | ||
1143 | if (ret < 0) { | ||
1144 | mlog_errno(ret); | ||
1145 | goto out; | ||
1146 | } | ||
1147 | 1140 | ||
1148 | copied += ret; | 1141 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, |
1142 | wc->w_pages[i], cpos, | ||
1143 | user_pos, user_len, new); | ||
1144 | if (tmpret) { | ||
1145 | mlog_errno(tmpret); | ||
1146 | if (ret == 0) | ||
1147 | tmpret = ret; | ||
1148 | } | ||
1149 | } | 1149 | } |
1150 | 1150 | ||
1151 | /* | ||
1152 | * We only have cleanup to do in case of allocating write. | ||
1153 | */ | ||
1154 | if (ret && new) | ||
1155 | ocfs2_write_failure(inode, wc, user_pos, user_len); | ||
1156 | |||
1151 | out: | 1157 | out: |
1152 | for(i = 0; i < numpages; i++) { | ||
1153 | unlock_page(cpages[i]); | ||
1154 | mark_page_accessed(cpages[i]); | ||
1155 | page_cache_release(cpages[i]); | ||
1156 | } | ||
1157 | kfree(cpages); | ||
1158 | 1158 | ||
1159 | return copied ? copied : ret; | 1159 | return ret; |
1160 | } | 1160 | } |
1161 | 1161 | ||
1162 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | 1162 | /* |
1163 | struct ocfs2_super *osb, loff_t pos, | 1163 | * ocfs2_write_end() wants to know which parts of the target page it |
1164 | size_t count, ocfs2_page_writer *cb, | 1164 | * should complete the write on. It's easiest to compute them ahead of |
1165 | void *cb_priv) | 1165 | * time when a more complete view of the write is available. |
1166 | */ | ||
1167 | static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | ||
1168 | struct ocfs2_write_ctxt *wc, | ||
1169 | loff_t pos, unsigned len, int alloc) | ||
1166 | { | 1170 | { |
1167 | wc->w_count = count; | 1171 | struct ocfs2_write_cluster_desc *desc; |
1168 | wc->w_pos = pos; | ||
1169 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
1170 | wc->w_finished_copy = 0; | ||
1171 | 1172 | ||
1172 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | 1173 | wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); |
1173 | wc->w_large_pages = 1; | 1174 | wc->w_target_to = wc->w_target_from + len; |
1174 | else | 1175 | |
1175 | wc->w_large_pages = 0; | 1176 | if (alloc == 0) |
1177 | return; | ||
1178 | |||
1179 | /* | ||
1180 | * Allocating write - we may have different boundaries based | ||
1181 | * on page size and cluster size. | ||
1182 | * | ||
1183 | * NOTE: We can no longer compute one value from the other as | ||
1184 | * the actual write length and user provided length may be | ||
1185 | * different. | ||
1186 | */ | ||
1176 | 1187 | ||
1177 | wc->w_write_data_page = cb; | 1188 | if (wc->w_large_pages) { |
1178 | wc->w_private = cb_priv; | 1189 | /* |
1190 | * We only care about the 1st and last cluster within | ||
1191 | * our range and whether they are holes or not. Either | ||
1192 | * value may be extended out to the start/end of a | ||
1193 | * newly allocated cluster. | ||
1194 | */ | ||
1195 | desc = &wc->w_desc[0]; | ||
1196 | if (desc->c_new) | ||
1197 | ocfs2_figure_cluster_boundaries(osb, | ||
1198 | desc->c_cpos, | ||
1199 | &wc->w_target_from, | ||
1200 | NULL); | ||
1201 | |||
1202 | desc = &wc->w_desc[wc->w_clen - 1]; | ||
1203 | if (desc->c_new) | ||
1204 | ocfs2_figure_cluster_boundaries(osb, | ||
1205 | desc->c_cpos, | ||
1206 | NULL, | ||
1207 | &wc->w_target_to); | ||
1208 | } else { | ||
1209 | wc->w_target_from = 0; | ||
1210 | wc->w_target_to = PAGE_CACHE_SIZE; | ||
1211 | } | ||
1179 | } | 1212 | } |
1180 | 1213 | ||
1181 | /* | 1214 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, |
1182 | * Write a cluster to an inode. The cluster may not be allocated yet, | 1215 | loff_t pos, unsigned len, unsigned flags, |
1183 | * in which case it will be. This only exists for buffered writes - | 1216 | struct page **pagep, void **fsdata) |
1184 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
1185 | * | ||
1186 | * The caller is responsible for incrementing pos, written counts, etc | ||
1187 | * | ||
1188 | * For file systems that don't support sparse files, pre-allocation | ||
1189 | * and page zeroing up until cpos should be done prior to this | ||
1190 | * function call. | ||
1191 | * | ||
1192 | * Callers should be holding i_sem, and the rw cluster lock. | ||
1193 | * | ||
1194 | * Returns the number of user bytes written, or less than zero for | ||
1195 | * error. | ||
1196 | */ | ||
1197 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
1198 | size_t count, ocfs2_page_writer *actor, | ||
1199 | void *priv) | ||
1200 | { | 1217 | { |
1201 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | 1218 | int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; |
1202 | ssize_t written = 0; | 1219 | unsigned int num_clusters = 0, clusters_to_alloc = 0; |
1203 | u32 phys; | 1220 | u32 phys = 0; |
1204 | struct inode *inode = file->f_mapping->host; | 1221 | struct ocfs2_write_ctxt *wc; |
1222 | struct inode *inode = mapping->host; | ||
1205 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1223 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1206 | struct buffer_head *di_bh = NULL; | ||
1207 | struct ocfs2_dinode *di; | 1224 | struct ocfs2_dinode *di; |
1208 | struct ocfs2_alloc_context *data_ac = NULL; | 1225 | struct ocfs2_alloc_context *data_ac = NULL; |
1209 | struct ocfs2_alloc_context *meta_ac = NULL; | 1226 | struct ocfs2_alloc_context *meta_ac = NULL; |
1210 | handle_t *handle; | 1227 | handle_t *handle; |
1211 | struct ocfs2_write_ctxt wc; | 1228 | struct ocfs2_write_cluster_desc *desc; |
1212 | 1229 | ||
1213 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | 1230 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len); |
1231 | if (ret) { | ||
1232 | mlog_errno(ret); | ||
1233 | return ret; | ||
1234 | } | ||
1214 | 1235 | ||
1215 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | 1236 | ret = ocfs2_meta_lock(inode, &wc->w_di_bh, 1); |
1216 | if (ret) { | 1237 | if (ret) { |
1217 | mlog_errno(ret); | 1238 | mlog_errno(ret); |
1218 | goto out; | 1239 | goto out; |
1219 | } | 1240 | } |
1220 | di = (struct ocfs2_dinode *)di_bh->b_data; | 1241 | di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; |
1221 | 1242 | ||
1222 | /* | 1243 | /* |
1223 | * Take alloc sem here to prevent concurrent lookups. That way | 1244 | * Take alloc sem here to prevent concurrent lookups. That way |
@@ -1228,23 +1249,60 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | |||
1228 | */ | 1249 | */ |
1229 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 1250 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
1230 | 1251 | ||
1231 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | 1252 | for (i = 0; i < wc->w_clen; i++) { |
1232 | if (ret) { | 1253 | desc = &wc->w_desc[i]; |
1233 | mlog_errno(ret); | 1254 | desc->c_cpos = wc->w_cpos + i; |
1234 | goto out_meta; | 1255 | |
1256 | if (num_clusters == 0) { | ||
1257 | ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, | ||
1258 | &num_clusters, NULL); | ||
1259 | if (ret) { | ||
1260 | mlog_errno(ret); | ||
1261 | goto out_meta; | ||
1262 | } | ||
1263 | } else if (phys) { | ||
1264 | /* | ||
1265 | * Only increment phys if it doesn't describe | ||
1266 | * a hole. | ||
1267 | */ | ||
1268 | phys++; | ||
1269 | } | ||
1270 | |||
1271 | desc->c_phys = phys; | ||
1272 | if (phys == 0) { | ||
1273 | desc->c_new = 1; | ||
1274 | clusters_to_alloc++; | ||
1275 | } | ||
1276 | |||
1277 | num_clusters--; | ||
1235 | } | 1278 | } |
1236 | 1279 | ||
1237 | /* phys == 0 means that allocation is required. */ | 1280 | /* |
1238 | if (phys == 0) { | 1281 | * We set w_target_from, w_target_to here so that |
1239 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | 1282 | * ocfs2_write_end() knows which range in the target page to |
1283 | * write out. An allocation requires that we write the entire | ||
1284 | * cluster range. | ||
1285 | */ | ||
1286 | if (clusters_to_alloc > 0) { | ||
1287 | /* | ||
1288 | * XXX: We are stretching the limits of | ||
1289 | * ocfs2_lock_allocators(). It greatly over-estimates | ||
1290 | * the work to be done. | ||
1291 | */ | ||
1292 | ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, | ||
1293 | &data_ac, &meta_ac); | ||
1240 | if (ret) { | 1294 | if (ret) { |
1241 | mlog_errno(ret); | 1295 | mlog_errno(ret); |
1242 | goto out_meta; | 1296 | goto out_meta; |
1243 | } | 1297 | } |
1244 | 1298 | ||
1245 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | 1299 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, |
1300 | clusters_to_alloc); | ||
1301 | |||
1246 | } | 1302 | } |
1247 | 1303 | ||
1304 | ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); | ||
1305 | |||
1248 | ret = ocfs2_data_lock(inode, 1); | 1306 | ret = ocfs2_data_lock(inode, 1); |
1249 | if (ret) { | 1307 | if (ret) { |
1250 | mlog_errno(ret); | 1308 | mlog_errno(ret); |
@@ -1258,36 +1316,50 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | |||
1258 | goto out_data; | 1316 | goto out_data; |
1259 | } | 1317 | } |
1260 | 1318 | ||
1261 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | 1319 | wc->w_handle = handle; |
1262 | meta_ac, &wc); | 1320 | |
1263 | if (written < 0) { | 1321 | /* |
1264 | ret = written; | 1322 | * We don't want this to fail in ocfs2_write_end(), so do it |
1323 | * here. | ||
1324 | */ | ||
1325 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | ||
1326 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1327 | if (ret) { | ||
1265 | mlog_errno(ret); | 1328 | mlog_errno(ret); |
1266 | goto out_commit; | 1329 | goto out_commit; |
1267 | } | 1330 | } |
1268 | 1331 | ||
1269 | ret = ocfs2_journal_access(handle, inode, di_bh, | 1332 | /* |
1270 | OCFS2_JOURNAL_ACCESS_WRITE); | 1333 | * Fill our page array first. That way we've grabbed enough so |
1334 | * that we can zero and flush if we error after adding the | ||
1335 | * extent. | ||
1336 | */ | ||
1337 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | ||
1338 | clusters_to_alloc); | ||
1271 | if (ret) { | 1339 | if (ret) { |
1272 | mlog_errno(ret); | 1340 | mlog_errno(ret); |
1273 | goto out_commit; | 1341 | goto out_commit; |
1274 | } | 1342 | } |
1275 | 1343 | ||
1276 | pos += written; | 1344 | for (i = 0; i < wc->w_clen; i++) { |
1277 | if (pos > inode->i_size) { | 1345 | desc = &wc->w_desc[i]; |
1278 | i_size_write(inode, pos); | 1346 | |
1279 | mark_inode_dirty(inode); | 1347 | ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, |
1348 | meta_ac, wc, desc->c_cpos, pos, len); | ||
1349 | if (ret) { | ||
1350 | mlog_errno(ret); | ||
1351 | goto out_commit; | ||
1352 | } | ||
1280 | } | 1353 | } |
1281 | inode->i_blocks = ocfs2_inode_sector_count(inode); | ||
1282 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
1283 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
1284 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1285 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1286 | 1354 | ||
1287 | ret = ocfs2_journal_dirty(handle, di_bh); | 1355 | if (data_ac) |
1288 | if (ret) | 1356 | ocfs2_free_alloc_context(data_ac); |
1289 | mlog_errno(ret); | 1357 | if (meta_ac) |
1358 | ocfs2_free_alloc_context(meta_ac); | ||
1290 | 1359 | ||
1360 | *pagep = wc->w_target_page; | ||
1361 | *fsdata = wc; | ||
1362 | return 0; | ||
1291 | out_commit: | 1363 | out_commit: |
1292 | ocfs2_commit_trans(osb, handle); | 1364 | ocfs2_commit_trans(osb, handle); |
1293 | 1365 | ||
@@ -1299,13 +1371,85 @@ out_meta: | |||
1299 | ocfs2_meta_unlock(inode, 1); | 1371 | ocfs2_meta_unlock(inode, 1); |
1300 | 1372 | ||
1301 | out: | 1373 | out: |
1302 | brelse(di_bh); | 1374 | ocfs2_free_write_ctxt(wc); |
1375 | |||
1303 | if (data_ac) | 1376 | if (data_ac) |
1304 | ocfs2_free_alloc_context(data_ac); | 1377 | ocfs2_free_alloc_context(data_ac); |
1305 | if (meta_ac) | 1378 | if (meta_ac) |
1306 | ocfs2_free_alloc_context(meta_ac); | 1379 | ocfs2_free_alloc_context(meta_ac); |
1380 | return ret; | ||
1381 | } | ||
1382 | |||
1383 | int ocfs2_write_end(struct file *file, struct address_space *mapping, | ||
1384 | loff_t pos, unsigned len, unsigned copied, | ||
1385 | struct page *page, void *fsdata) | ||
1386 | { | ||
1387 | int i; | ||
1388 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | ||
1389 | struct inode *inode = mapping->host; | ||
1390 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1391 | struct ocfs2_write_ctxt *wc = fsdata; | ||
1392 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | ||
1393 | handle_t *handle = wc->w_handle; | ||
1394 | struct page *tmppage; | ||
1395 | |||
1396 | if (unlikely(copied < len)) { | ||
1397 | if (!PageUptodate(wc->w_target_page)) | ||
1398 | copied = 0; | ||
1399 | |||
1400 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | ||
1401 | start+len); | ||
1402 | } | ||
1403 | flush_dcache_page(wc->w_target_page); | ||
1404 | |||
1405 | for(i = 0; i < wc->w_num_pages; i++) { | ||
1406 | tmppage = wc->w_pages[i]; | ||
1407 | |||
1408 | if (tmppage == wc->w_target_page) { | ||
1409 | from = wc->w_target_from; | ||
1410 | to = wc->w_target_to; | ||
1411 | |||
1412 | BUG_ON(from > PAGE_CACHE_SIZE || | ||
1413 | to > PAGE_CACHE_SIZE || | ||
1414 | to < from); | ||
1415 | } else { | ||
1416 | /* | ||
1417 | * Pages adjacent to the target (if any) imply | ||
1418 | * a hole-filling write in which case we want | ||
1419 | * to flush their entire range. | ||
1420 | */ | ||
1421 | from = 0; | ||
1422 | to = PAGE_CACHE_SIZE; | ||
1423 | } | ||
1424 | |||
1425 | if (ocfs2_should_order_data(inode)) | ||
1426 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | ||
1427 | from, to, NULL, | ||
1428 | ocfs2_journal_dirty_data); | ||
1429 | |||
1430 | block_commit_write(tmppage, from, to); | ||
1431 | } | ||
1432 | |||
1433 | pos += copied; | ||
1434 | if (pos > inode->i_size) { | ||
1435 | i_size_write(inode, pos); | ||
1436 | mark_inode_dirty(inode); | ||
1437 | } | ||
1438 | inode->i_blocks = ocfs2_inode_sector_count(inode); | ||
1439 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
1440 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
1441 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1442 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1443 | |||
1444 | ocfs2_journal_dirty(handle, wc->w_di_bh); | ||
1445 | |||
1446 | ocfs2_commit_trans(osb, handle); | ||
1447 | ocfs2_data_unlock(inode, 1); | ||
1448 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1449 | ocfs2_meta_unlock(inode, 1); | ||
1450 | ocfs2_free_write_ctxt(wc); | ||
1307 | 1451 | ||
1308 | return written ? written : ret; | 1452 | return copied; |
1309 | } | 1453 | } |
1310 | 1454 | ||
1311 | const struct address_space_operations ocfs2_aops = { | 1455 | const struct address_space_operations ocfs2_aops = { |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 45821d479b5a..bdcdd1ae63a9 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -42,57 +42,13 @@ int walk_page_buffers( handle_t *handle, | |||
42 | int (*fn)( handle_t *handle, | 42 | int (*fn)( handle_t *handle, |
43 | struct buffer_head *bh)); | 43 | struct buffer_head *bh)); |
44 | 44 | ||
45 | struct ocfs2_write_ctxt; | 45 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, |
46 | typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, | 46 | loff_t pos, unsigned len, unsigned flags, |
47 | u64 *, unsigned int *, unsigned int *); | 47 | struct page **pagep, void **fsdata); |
48 | 48 | ||
49 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | 49 | int ocfs2_write_end(struct file *file, struct address_space *mapping, |
50 | size_t count, ocfs2_page_writer *actor, | 50 | loff_t pos, unsigned len, unsigned copied, |
51 | void *priv); | 51 | struct page *page, void *fsdata); |
52 | |||
53 | struct ocfs2_write_ctxt { | ||
54 | size_t w_count; | ||
55 | loff_t w_pos; | ||
56 | u32 w_cpos; | ||
57 | unsigned int w_finished_copy; | ||
58 | |||
59 | /* This is true if page_size > cluster_size */ | ||
60 | unsigned int w_large_pages; | ||
61 | |||
62 | /* Filler callback and private data */ | ||
63 | ocfs2_page_writer *w_write_data_page; | ||
64 | void *w_private; | ||
65 | |||
66 | /* Only valid for the filler callback */ | ||
67 | struct page *w_this_page; | ||
68 | unsigned int w_this_page_new; | ||
69 | }; | ||
70 | |||
71 | struct ocfs2_buffered_write_priv { | ||
72 | char *b_src_buf; | ||
73 | const struct iovec *b_cur_iov; /* Current iovec */ | ||
74 | size_t b_cur_off; /* Offset in the | ||
75 | * current iovec */ | ||
76 | }; | ||
77 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
78 | struct ocfs2_write_ctxt *wc, | ||
79 | u64 *p_blkno, | ||
80 | unsigned int *ret_from, | ||
81 | unsigned int *ret_to); | ||
82 | |||
83 | struct ocfs2_splice_write_priv { | ||
84 | struct splice_desc *s_sd; | ||
85 | struct pipe_buffer *s_buf; | ||
86 | struct pipe_inode_info *s_pipe; | ||
87 | /* Neither offset value is ever larger than one page */ | ||
88 | unsigned int s_offset; | ||
89 | unsigned int s_buf_offset; | ||
90 | }; | ||
91 | int ocfs2_map_and_write_splice_data(struct inode *inode, | ||
92 | struct ocfs2_write_ctxt *wc, | ||
93 | u64 *p_blkno, | ||
94 | unsigned int *ret_from, | ||
95 | unsigned int *ret_to); | ||
96 | 52 | ||
97 | /* all ocfs2_dio_end_io()'s fault */ | 53 | /* all ocfs2_dio_end_io()'s fault */ |
98 | #define ocfs2_iocb_is_rw_locked(iocb) \ | 54 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 566f9b70ec91..4c850d00c269 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1335,15 +1335,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
1335 | *basep = base; | 1335 | *basep = base; |
1336 | } | 1336 | } |
1337 | 1337 | ||
1338 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | 1338 | static struct page * ocfs2_get_write_source(char **ret_src_buf, |
1339 | const struct iovec *cur_iov, | 1339 | const struct iovec *cur_iov, |
1340 | size_t iov_offset) | 1340 | size_t iov_offset) |
1341 | { | 1341 | { |
1342 | int ret; | 1342 | int ret; |
1343 | char *buf; | 1343 | char *buf = cur_iov->iov_base + iov_offset; |
1344 | struct page *src_page = NULL; | 1344 | struct page *src_page = NULL; |
1345 | unsigned long off; | ||
1345 | 1346 | ||
1346 | buf = cur_iov->iov_base + iov_offset; | 1347 | off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; |
1347 | 1348 | ||
1348 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 1349 | if (!segment_eq(get_fs(), KERNEL_DS)) { |
1349 | /* | 1350 | /* |
@@ -1355,18 +1356,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp | |||
1355 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | 1356 | (unsigned long)buf & PAGE_CACHE_MASK, 1, |
1356 | 0, 0, &src_page, NULL); | 1357 | 0, 0, &src_page, NULL); |
1357 | if (ret == 1) | 1358 | if (ret == 1) |
1358 | bp->b_src_buf = kmap(src_page); | 1359 | *ret_src_buf = kmap(src_page) + off; |
1359 | else | 1360 | else |
1360 | src_page = ERR_PTR(-EFAULT); | 1361 | src_page = ERR_PTR(-EFAULT); |
1361 | } else { | 1362 | } else { |
1362 | bp->b_src_buf = buf; | 1363 | *ret_src_buf = buf; |
1363 | } | 1364 | } |
1364 | 1365 | ||
1365 | return src_page; | 1366 | return src_page; |
1366 | } | 1367 | } |
1367 | 1368 | ||
1368 | static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, | 1369 | static void ocfs2_put_write_source(struct page *page) |
1369 | struct page *page) | ||
1370 | { | 1370 | { |
1371 | if (page) { | 1371 | if (page) { |
1372 | kunmap(page); | 1372 | kunmap(page); |
@@ -1382,10 +1382,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | |||
1382 | { | 1382 | { |
1383 | int ret = 0; | 1383 | int ret = 0; |
1384 | ssize_t copied, total = 0; | 1384 | ssize_t copied, total = 0; |
1385 | size_t iov_offset = 0; | 1385 | size_t iov_offset = 0, bytes; |
1386 | loff_t pos; | ||
1386 | const struct iovec *cur_iov = iov; | 1387 | const struct iovec *cur_iov = iov; |
1387 | struct ocfs2_buffered_write_priv bp; | 1388 | struct page *user_page, *page; |
1388 | struct page *page; | 1389 | char *buf, *dst; |
1390 | void *fsdata; | ||
1389 | 1391 | ||
1390 | /* | 1392 | /* |
1391 | * handle partial DIO write. Adjust cur_iov if needed. | 1393 | * handle partial DIO write. Adjust cur_iov if needed. |
@@ -1393,21 +1395,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | |||
1393 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | 1395 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); |
1394 | 1396 | ||
1395 | do { | 1397 | do { |
1396 | bp.b_cur_off = iov_offset; | 1398 | pos = *ppos; |
1397 | bp.b_cur_iov = cur_iov; | ||
1398 | 1399 | ||
1399 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | 1400 | user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); |
1400 | if (IS_ERR(page)) { | 1401 | if (IS_ERR(user_page)) { |
1401 | ret = PTR_ERR(page); | 1402 | ret = PTR_ERR(user_page); |
1402 | goto out; | 1403 | goto out; |
1403 | } | 1404 | } |
1404 | 1405 | ||
1405 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, | 1406 | /* Stay within our page boundaries */ |
1406 | ocfs2_map_and_write_user_data, | 1407 | bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), |
1407 | &bp); | 1408 | (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); |
1409 | /* Stay within the vector boundary */ | ||
1410 | bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); | ||
1411 | /* Stay within count */ | ||
1412 | bytes = min(bytes, count); | ||
1413 | |||
1414 | page = NULL; | ||
1415 | ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, | ||
1416 | &page, &fsdata); | ||
1417 | if (ret) { | ||
1418 | mlog_errno(ret); | ||
1419 | goto out; | ||
1420 | } | ||
1408 | 1421 | ||
1409 | ocfs2_put_write_source(&bp, page); | 1422 | dst = kmap_atomic(page, KM_USER0); |
1423 | memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); | ||
1424 | kunmap_atomic(dst, KM_USER0); | ||
1425 | flush_dcache_page(page); | ||
1426 | ocfs2_put_write_source(user_page); | ||
1410 | 1427 | ||
1428 | copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, | ||
1429 | bytes, page, fsdata); | ||
1411 | if (copied < 0) { | 1430 | if (copied < 0) { |
1412 | mlog_errno(copied); | 1431 | mlog_errno(copied); |
1413 | ret = copied; | 1432 | ret = copied; |
@@ -1415,7 +1434,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | |||
1415 | } | 1434 | } |
1416 | 1435 | ||
1417 | total += copied; | 1436 | total += copied; |
1418 | *ppos = *ppos + copied; | 1437 | *ppos = pos + copied; |
1419 | count -= copied; | 1438 | count -= copied; |
1420 | 1439 | ||
1421 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | 1440 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); |
@@ -1585,52 +1604,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, | |||
1585 | struct pipe_buffer *buf, | 1604 | struct pipe_buffer *buf, |
1586 | struct splice_desc *sd) | 1605 | struct splice_desc *sd) |
1587 | { | 1606 | { |
1588 | int ret, count, total = 0; | 1607 | int ret, count; |
1589 | ssize_t copied = 0; | 1608 | ssize_t copied = 0; |
1590 | struct ocfs2_splice_write_priv sp; | 1609 | struct file *file = sd->u.file; |
1610 | unsigned int offset; | ||
1611 | struct page *page = NULL; | ||
1612 | void *fsdata; | ||
1613 | char *src, *dst; | ||
1591 | 1614 | ||
1592 | ret = buf->ops->confirm(pipe, buf); | 1615 | ret = buf->ops->confirm(pipe, buf); |
1593 | if (ret) | 1616 | if (ret) |
1594 | goto out; | 1617 | goto out; |
1595 | 1618 | ||
1596 | sp.s_sd = sd; | 1619 | offset = sd->pos & ~PAGE_CACHE_MASK; |
1597 | sp.s_buf = buf; | ||
1598 | sp.s_pipe = pipe; | ||
1599 | sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; | ||
1600 | sp.s_buf_offset = buf->offset; | ||
1601 | |||
1602 | count = sd->len; | 1620 | count = sd->len; |
1603 | if (count + sp.s_offset > PAGE_CACHE_SIZE) | 1621 | if (count + offset > PAGE_CACHE_SIZE) |
1604 | count = PAGE_CACHE_SIZE - sp.s_offset; | 1622 | count = PAGE_CACHE_SIZE - offset; |
1605 | 1623 | ||
1606 | do { | 1624 | ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, |
1607 | /* | 1625 | &page, &fsdata); |
1608 | * splice wants us to copy up to one page at a | 1626 | if (ret) { |
1609 | * time. For pagesize > cluster size, this means we | 1627 | mlog_errno(ret); |
1610 | * might enter ocfs2_buffered_write_cluster() more | 1628 | goto out; |
1611 | * than once, so keep track of our progress here. | 1629 | } |
1612 | */ | ||
1613 | copied = ocfs2_buffered_write_cluster(sd->u.file, | ||
1614 | (loff_t)sd->pos + total, | ||
1615 | count, | ||
1616 | ocfs2_map_and_write_splice_data, | ||
1617 | &sp); | ||
1618 | if (copied < 0) { | ||
1619 | mlog_errno(copied); | ||
1620 | ret = copied; | ||
1621 | goto out; | ||
1622 | } | ||
1623 | 1630 | ||
1624 | count -= copied; | 1631 | src = buf->ops->map(pipe, buf, 1); |
1625 | sp.s_offset += copied; | 1632 | dst = kmap_atomic(page, KM_USER1); |
1626 | sp.s_buf_offset += copied; | 1633 | memcpy(dst + offset, src + buf->offset, count); |
1627 | total += copied; | 1634 | kunmap_atomic(page, KM_USER1); |
1628 | } while (count); | 1635 | buf->ops->unmap(pipe, buf, src); |
1629 | 1636 | ||
1630 | ret = 0; | 1637 | copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, |
1638 | page, fsdata); | ||
1639 | if (copied < 0) { | ||
1640 | mlog_errno(copied); | ||
1641 | ret = copied; | ||
1642 | goto out; | ||
1643 | } | ||
1631 | out: | 1644 | out: |
1632 | 1645 | ||
1633 | return total ? total : ret; | 1646 | return copied ? copied : ret; |
1634 | } | 1647 | } |
1635 | 1648 | ||
1636 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 1649 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, |