diff options
Diffstat (limited to 'fs/ocfs2/aops.c')
-rw-r--r-- | fs/ocfs2/aops.c | 1015 |
1 files changed, 658 insertions, 357 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a480b09c79b9..84bf6e79de23 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
684 | bh = bh->b_this_page, block_start += bsize) { | 684 | bh = bh->b_this_page, block_start += bsize) { |
685 | block_end = block_start + bsize; | 685 | block_end = block_start + bsize; |
686 | 686 | ||
687 | clear_buffer_new(bh); | ||
688 | |||
687 | /* | 689 | /* |
688 | * Ignore blocks outside of our i/o range - | 690 | * Ignore blocks outside of our i/o range - |
689 | * they may belong to unallocated clusters. | 691 | * they may belong to unallocated clusters. |
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
698 | * For an allocating write with cluster size >= page | 700 | * For an allocating write with cluster size >= page |
699 | * size, we always write the entire page. | 701 | * size, we always write the entire page. |
700 | */ | 702 | */ |
701 | 703 | if (new) | |
702 | if (buffer_new(bh)) | 704 | set_buffer_new(bh); |
703 | clear_buffer_new(bh); | ||
704 | 705 | ||
705 | if (!buffer_mapped(bh)) { | 706 | if (!buffer_mapped(bh)) { |
706 | map_bh(bh, inode->i_sb, *p_blkno); | 707 | map_bh(bh, inode->i_sb, *p_blkno); |
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
711 | if (!buffer_uptodate(bh)) | 712 | if (!buffer_uptodate(bh)) |
712 | set_buffer_uptodate(bh); | 713 | set_buffer_uptodate(bh); |
713 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | 714 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
714 | (block_start < from || block_end > to)) { | 715 | !buffer_new(bh) && |
716 | (block_start < from || block_end > to)) { | ||
715 | ll_rw_block(READ, 1, &bh); | 717 | ll_rw_block(READ, 1, &bh); |
716 | *wait_bh++=bh; | 718 | *wait_bh++=bh; |
717 | } | 719 | } |
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
738 | bh = head; | 740 | bh = head; |
739 | block_start = 0; | 741 | block_start = 0; |
740 | do { | 742 | do { |
741 | void *kaddr; | ||
742 | |||
743 | block_end = block_start + bsize; | 743 | block_end = block_start + bsize; |
744 | if (block_end <= from) | 744 | if (block_end <= from) |
745 | goto next_bh; | 745 | goto next_bh; |
746 | if (block_start >= to) | 746 | if (block_start >= to) |
747 | break; | 747 | break; |
748 | 748 | ||
749 | kaddr = kmap_atomic(page, KM_USER0); | 749 | zero_user_page(page, block_start, bh->b_size, KM_USER0); |
750 | memset(kaddr+block_start, 0, bh->b_size); | ||
751 | flush_dcache_page(page); | ||
752 | kunmap_atomic(kaddr, KM_USER0); | ||
753 | set_buffer_uptodate(bh); | 750 | set_buffer_uptodate(bh); |
754 | mark_buffer_dirty(bh); | 751 | mark_buffer_dirty(bh); |
755 | 752 | ||
@@ -761,217 +758,240 @@ next_bh: | |||
761 | return ret; | 758 | return ret; |
762 | } | 759 | } |
763 | 760 | ||
761 | #if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) | ||
762 | #define OCFS2_MAX_CTXT_PAGES 1 | ||
763 | #else | ||
764 | #define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) | ||
765 | #endif | ||
766 | |||
767 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) | ||
768 | |||
764 | /* | 769 | /* |
765 | * This will copy user data from the buffer page in the splice | 770 | * Describe the state of a single cluster to be written to. |
766 | * context. | ||
767 | * | ||
768 | * For now, we ignore SPLICE_F_MOVE as that would require some extra | ||
769 | * communication out all the way to ocfs2_write(). | ||
770 | */ | 771 | */ |
771 | int ocfs2_map_and_write_splice_data(struct inode *inode, | 772 | struct ocfs2_write_cluster_desc { |
772 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | 773 | u32 c_cpos; |
773 | unsigned int *ret_from, unsigned int *ret_to) | 774 | u32 c_phys; |
775 | /* | ||
776 | * Give this a unique field because c_phys eventually gets | ||
777 | * filled. | ||
778 | */ | ||
779 | unsigned c_new; | ||
780 | unsigned c_unwritten; | ||
781 | }; | ||
782 | |||
783 | static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) | ||
774 | { | 784 | { |
775 | int ret; | 785 | return d->c_new || d->c_unwritten; |
776 | unsigned int to, from, cluster_start, cluster_end; | 786 | } |
777 | char *src, *dst; | ||
778 | struct ocfs2_splice_write_priv *sp = wc->w_private; | ||
779 | struct pipe_buffer *buf = sp->s_buf; | ||
780 | unsigned long bytes, src_from; | ||
781 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
782 | 787 | ||
783 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 788 | struct ocfs2_write_ctxt { |
784 | &cluster_end); | 789 | /* Logical cluster position / len of write */ |
790 | u32 w_cpos; | ||
791 | u32 w_clen; | ||
785 | 792 | ||
786 | from = sp->s_offset; | 793 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; |
787 | src_from = sp->s_buf_offset; | ||
788 | bytes = wc->w_count; | ||
789 | 794 | ||
790 | if (wc->w_large_pages) { | 795 | /* |
791 | /* | 796 | * This is true if page_size > cluster_size. |
792 | * For cluster size < page size, we have to | 797 | * |
793 | * calculate pos within the cluster and obey | 798 | * It triggers a set of special cases during write which might |
794 | * the rightmost boundary. | 799 | * have to deal with allocating writes to partial pages. |
795 | */ | 800 | */ |
796 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | 801 | unsigned int w_large_pages; |
797 | - (wc->w_pos & (osb->s_clustersize - 1)))); | 802 | |
798 | } | 803 | /* |
799 | to = from + bytes; | 804 | * Pages involved in this write. |
805 | * | ||
806 | * w_target_page is the page being written to by the user. | ||
807 | * | ||
808 | * w_pages is an array of pages which always contains | ||
809 | * w_target_page, and in the case of an allocating write with | ||
810 | * page_size < cluster size, it will contain zero'd and mapped | ||
811 | * pages adjacent to w_target_page which need to be written | ||
812 | * out in so that future reads from that region will get | ||
813 | * zero's. | ||
814 | */ | ||
815 | struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; | ||
816 | unsigned int w_num_pages; | ||
817 | struct page *w_target_page; | ||
800 | 818 | ||
801 | BUG_ON(from > PAGE_CACHE_SIZE); | 819 | /* |
802 | BUG_ON(to > PAGE_CACHE_SIZE); | 820 | * ocfs2_write_end() uses this to know what the real range to |
803 | BUG_ON(from < cluster_start); | 821 | * write in the target should be. |
804 | BUG_ON(to > cluster_end); | 822 | */ |
823 | unsigned int w_target_from; | ||
824 | unsigned int w_target_to; | ||
805 | 825 | ||
806 | if (wc->w_this_page_new) | 826 | /* |
807 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 827 | * We could use journal_current_handle() but this is cleaner, |
808 | cluster_start, cluster_end, 1); | 828 | * IMHO -Mark |
809 | else | 829 | */ |
810 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 830 | handle_t *w_handle; |
811 | from, to, 0); | 831 | |
812 | if (ret) { | 832 | struct buffer_head *w_di_bh; |
813 | mlog_errno(ret); | 833 | |
814 | goto out; | 834 | struct ocfs2_cached_dealloc_ctxt w_dealloc; |
835 | }; | ||
836 | |||
837 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | ||
838 | { | ||
839 | int i; | ||
840 | |||
841 | for(i = 0; i < wc->w_num_pages; i++) { | ||
842 | if (wc->w_pages[i] == NULL) | ||
843 | continue; | ||
844 | |||
845 | unlock_page(wc->w_pages[i]); | ||
846 | mark_page_accessed(wc->w_pages[i]); | ||
847 | page_cache_release(wc->w_pages[i]); | ||
815 | } | 848 | } |
816 | 849 | ||
817 | src = buf->ops->map(sp->s_pipe, buf, 1); | 850 | brelse(wc->w_di_bh); |
818 | dst = kmap_atomic(wc->w_this_page, KM_USER1); | 851 | kfree(wc); |
819 | memcpy(dst + from, src + src_from, bytes); | 852 | } |
820 | kunmap_atomic(wc->w_this_page, KM_USER1); | 853 | |
821 | buf->ops->unmap(sp->s_pipe, buf, src); | 854 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, |
855 | struct ocfs2_super *osb, loff_t pos, | ||
856 | unsigned len, struct buffer_head *di_bh) | ||
857 | { | ||
858 | struct ocfs2_write_ctxt *wc; | ||
859 | |||
860 | wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); | ||
861 | if (!wc) | ||
862 | return -ENOMEM; | ||
822 | 863 | ||
823 | wc->w_finished_copy = 1; | 864 | wc->w_cpos = pos >> osb->s_clustersize_bits; |
865 | wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); | ||
866 | get_bh(di_bh); | ||
867 | wc->w_di_bh = di_bh; | ||
824 | 868 | ||
825 | *ret_from = from; | 869 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) |
826 | *ret_to = to; | 870 | wc->w_large_pages = 1; |
827 | out: | 871 | else |
872 | wc->w_large_pages = 0; | ||
873 | |||
874 | ocfs2_init_dealloc_ctxt(&wc->w_dealloc); | ||
875 | |||
876 | *wcp = wc; | ||
828 | 877 | ||
829 | return bytes ? (unsigned int)bytes : ret; | 878 | return 0; |
830 | } | 879 | } |
831 | 880 | ||
832 | /* | 881 | /* |
833 | * This will copy user data from the iovec in the buffered write | 882 | * If a page has any new buffers, zero them out here, and mark them uptodate |
834 | * context. | 883 | * and dirty so they'll be written out (in order to prevent uninitialised |
884 | * block data from leaking). And clear the new bit. | ||
835 | */ | 885 | */ |
836 | int ocfs2_map_and_write_user_data(struct inode *inode, | 886 | static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) |
837 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
838 | unsigned int *ret_from, unsigned int *ret_to) | ||
839 | { | 887 | { |
840 | int ret; | 888 | unsigned int block_start, block_end; |
841 | unsigned int to, from, cluster_start, cluster_end; | 889 | struct buffer_head *head, *bh; |
842 | unsigned long bytes, src_from; | ||
843 | char *dst; | ||
844 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
845 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
846 | char __user *buf; | ||
847 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
848 | 890 | ||
849 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 891 | BUG_ON(!PageLocked(page)); |
850 | &cluster_end); | 892 | if (!page_has_buffers(page)) |
893 | return; | ||
851 | 894 | ||
852 | buf = cur_iov->iov_base + bp->b_cur_off; | 895 | bh = head = page_buffers(page); |
853 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | 896 | block_start = 0; |
897 | do { | ||
898 | block_end = block_start + bh->b_size; | ||
854 | 899 | ||
855 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | 900 | if (buffer_new(bh)) { |
901 | if (block_end > from && block_start < to) { | ||
902 | if (!PageUptodate(page)) { | ||
903 | unsigned start, end; | ||
856 | 904 | ||
857 | /* | 905 | start = max(from, block_start); |
858 | * This is a lot of comparisons, but it reads quite | 906 | end = min(to, block_end); |
859 | * easily, which is important here. | ||
860 | */ | ||
861 | /* Stay within the src page */ | ||
862 | bytes = PAGE_SIZE - src_from; | ||
863 | /* Stay within the vector */ | ||
864 | bytes = min(bytes, | ||
865 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
866 | /* Stay within count */ | ||
867 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
868 | /* | ||
869 | * For clustersize > page size, just stay within | ||
870 | * target page, otherwise we have to calculate pos | ||
871 | * within the cluster and obey the rightmost | ||
872 | * boundary. | ||
873 | */ | ||
874 | if (wc->w_large_pages) { | ||
875 | /* | ||
876 | * For cluster size < page size, we have to | ||
877 | * calculate pos within the cluster and obey | ||
878 | * the rightmost boundary. | ||
879 | */ | ||
880 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
881 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
882 | } else { | ||
883 | /* | ||
884 | * cluster size > page size is the most common | ||
885 | * case - we just stay within the target page | ||
886 | * boundary. | ||
887 | */ | ||
888 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
889 | } | ||
890 | 907 | ||
891 | to = from + bytes; | 908 | zero_user_page(page, start, end - start, KM_USER0); |
909 | set_buffer_uptodate(bh); | ||
910 | } | ||
892 | 911 | ||
893 | BUG_ON(from > PAGE_CACHE_SIZE); | 912 | clear_buffer_new(bh); |
894 | BUG_ON(to > PAGE_CACHE_SIZE); | 913 | mark_buffer_dirty(bh); |
895 | BUG_ON(from < cluster_start); | 914 | } |
896 | BUG_ON(to > cluster_end); | 915 | } |
897 | 916 | ||
898 | if (wc->w_this_page_new) | 917 | block_start = block_end; |
899 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 918 | bh = bh->b_this_page; |
900 | cluster_start, cluster_end, 1); | 919 | } while (bh != head); |
901 | else | 920 | } |
902 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
903 | from, to, 0); | ||
904 | if (ret) { | ||
905 | mlog_errno(ret); | ||
906 | goto out; | ||
907 | } | ||
908 | 921 | ||
909 | dst = kmap(wc->w_this_page); | 922 | /* |
910 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | 923 | * Only called when we have a failure during allocating write to write |
911 | kunmap(wc->w_this_page); | 924 | * zero's to the newly allocated region. |
925 | */ | ||
926 | static void ocfs2_write_failure(struct inode *inode, | ||
927 | struct ocfs2_write_ctxt *wc, | ||
928 | loff_t user_pos, unsigned user_len) | ||
929 | { | ||
930 | int i; | ||
931 | unsigned from, to; | ||
932 | struct page *tmppage; | ||
912 | 933 | ||
913 | /* | 934 | ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); |
914 | * XXX: This is slow, but simple. The caller of | ||
915 | * ocfs2_buffered_write_cluster() is responsible for | ||
916 | * passing through the iovecs, so it's difficult to | ||
917 | * predict what our next step is in here after our | ||
918 | * initial write. A future version should be pushing | ||
919 | * that iovec manipulation further down. | ||
920 | * | ||
921 | * By setting this, we indicate that a copy from user | ||
922 | * data was done, and subsequent calls for this | ||
923 | * cluster will skip copying more data. | ||
924 | */ | ||
925 | wc->w_finished_copy = 1; | ||
926 | 935 | ||
927 | *ret_from = from; | 936 | if (wc->w_large_pages) { |
928 | *ret_to = to; | 937 | from = wc->w_target_from; |
929 | out: | 938 | to = wc->w_target_to; |
939 | } else { | ||
940 | from = 0; | ||
941 | to = PAGE_CACHE_SIZE; | ||
942 | } | ||
943 | |||
944 | for(i = 0; i < wc->w_num_pages; i++) { | ||
945 | tmppage = wc->w_pages[i]; | ||
930 | 946 | ||
931 | return bytes ? (unsigned int)bytes : ret; | 947 | if (ocfs2_should_order_data(inode)) |
948 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | ||
949 | from, to, NULL, | ||
950 | ocfs2_journal_dirty_data); | ||
951 | |||
952 | block_commit_write(tmppage, from, to); | ||
953 | } | ||
932 | } | 954 | } |
933 | 955 | ||
934 | /* | 956 | static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, |
935 | * Map, fill and write a page to disk. | 957 | struct ocfs2_write_ctxt *wc, |
936 | * | 958 | struct page *page, u32 cpos, |
937 | * The work of copying data is done via callback. Newly allocated | 959 | loff_t user_pos, unsigned user_len, |
938 | * pages which don't take user data will be zero'd (set 'new' to | 960 | int new) |
939 | * indicate an allocating write) | ||
940 | * | ||
941 | * Returns a negative error code or the number of bytes copied into | ||
942 | * the page. | ||
943 | */ | ||
944 | static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
945 | u64 *p_blkno, struct page *page, | ||
946 | struct ocfs2_write_ctxt *wc, int new) | ||
947 | { | 961 | { |
948 | int ret, copied = 0; | 962 | int ret; |
949 | unsigned int from = 0, to = 0; | 963 | unsigned int map_from = 0, map_to = 0; |
950 | unsigned int cluster_start, cluster_end; | 964 | unsigned int cluster_start, cluster_end; |
951 | unsigned int zero_from = 0, zero_to = 0; | 965 | unsigned int user_data_from = 0, user_data_to = 0; |
952 | 966 | ||
953 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | 967 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, |
954 | &cluster_start, &cluster_end); | 968 | &cluster_start, &cluster_end); |
955 | 969 | ||
956 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | 970 | if (page == wc->w_target_page) { |
957 | && !wc->w_finished_copy) { | 971 | map_from = user_pos & (PAGE_CACHE_SIZE - 1); |
958 | 972 | map_to = map_from + user_len; | |
959 | wc->w_this_page = page; | 973 | |
960 | wc->w_this_page_new = new; | 974 | if (new) |
961 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | 975 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
962 | if (ret < 0) { | 976 | cluster_start, cluster_end, |
977 | new); | ||
978 | else | ||
979 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
980 | map_from, map_to, new); | ||
981 | if (ret) { | ||
963 | mlog_errno(ret); | 982 | mlog_errno(ret); |
964 | goto out; | 983 | goto out; |
965 | } | 984 | } |
966 | 985 | ||
967 | copied = ret; | 986 | user_data_from = map_from; |
968 | 987 | user_data_to = map_to; | |
969 | zero_from = from; | ||
970 | zero_to = to; | ||
971 | if (new) { | 988 | if (new) { |
972 | from = cluster_start; | 989 | map_from = cluster_start; |
973 | to = cluster_end; | 990 | map_to = cluster_end; |
974 | } | 991 | } |
992 | |||
993 | wc->w_target_from = map_from; | ||
994 | wc->w_target_to = map_to; | ||
975 | } else { | 995 | } else { |
976 | /* | 996 | /* |
977 | * If we haven't allocated the new page yet, we | 997 | * If we haven't allocated the new page yet, we |
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
980 | */ | 1000 | */ |
981 | BUG_ON(!new); | 1001 | BUG_ON(!new); |
982 | 1002 | ||
983 | from = cluster_start; | 1003 | map_from = cluster_start; |
984 | to = cluster_end; | 1004 | map_to = cluster_end; |
985 | 1005 | ||
986 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | 1006 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
987 | cluster_start, cluster_end, 1); | 1007 | cluster_start, cluster_end, new); |
988 | if (ret) { | 1008 | if (ret) { |
989 | mlog_errno(ret); | 1009 | mlog_errno(ret); |
990 | goto out; | 1010 | goto out; |
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
1003 | */ | 1023 | */ |
1004 | if (new && !PageUptodate(page)) | 1024 | if (new && !PageUptodate(page)) |
1005 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | 1025 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), |
1006 | wc->w_cpos, zero_from, zero_to); | 1026 | cpos, user_data_from, user_data_to); |
1007 | 1027 | ||
1008 | flush_dcache_page(page); | 1028 | flush_dcache_page(page); |
1009 | 1029 | ||
1010 | if (ocfs2_should_order_data(inode)) { | ||
1011 | ret = walk_page_buffers(handle, | ||
1012 | page_buffers(page), | ||
1013 | from, to, NULL, | ||
1014 | ocfs2_journal_dirty_data); | ||
1015 | if (ret < 0) | ||
1016 | mlog_errno(ret); | ||
1017 | } | ||
1018 | |||
1019 | /* | ||
1020 | * We don't use generic_commit_write() because we need to | ||
1021 | * handle our own i_size update. | ||
1022 | */ | ||
1023 | ret = block_commit_write(page, from, to); | ||
1024 | if (ret) | ||
1025 | mlog_errno(ret); | ||
1026 | out: | 1030 | out: |
1027 | 1031 | return ret; | |
1028 | return copied ? copied : ret; | ||
1029 | } | 1032 | } |
1030 | 1033 | ||
1031 | /* | 1034 | /* |
1032 | * Do the actual write of some data into an inode. Optionally allocate | 1035 | * This function will only grab one clusters worth of pages. |
1033 | * in order to fulfill the write. | ||
1034 | * | ||
1035 | * cpos is the logical cluster offset within the file to write at | ||
1036 | * | ||
1037 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
1038 | * zero indicates that allocation is required. In this case, data_ac | ||
1039 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
1040 | * allocation isn't required). | ||
1041 | */ | 1036 | */ |
1042 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | 1037 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, |
1043 | struct buffer_head *di_bh, | 1038 | struct ocfs2_write_ctxt *wc, |
1044 | struct ocfs2_alloc_context *data_ac, | 1039 | u32 cpos, loff_t user_pos, int new, |
1045 | struct ocfs2_alloc_context *meta_ac, | 1040 | struct page *mmap_page) |
1046 | struct ocfs2_write_ctxt *wc) | ||
1047 | { | 1041 | { |
1048 | int ret, i, numpages = 1, new; | 1042 | int ret = 0, i; |
1049 | unsigned int copied = 0; | 1043 | unsigned long start, target_index, index; |
1050 | u32 tmp_pos; | ||
1051 | u64 v_blkno, p_blkno; | ||
1052 | struct address_space *mapping = file->f_mapping; | ||
1053 | struct inode *inode = mapping->host; | 1044 | struct inode *inode = mapping->host; |
1054 | unsigned long index, start; | ||
1055 | struct page **cpages; | ||
1056 | 1045 | ||
1057 | new = phys == 0 ? 1 : 0; | 1046 | target_index = user_pos >> PAGE_CACHE_SHIFT; |
1058 | 1047 | ||
1059 | /* | 1048 | /* |
1060 | * Figure out how many pages we'll be manipulating here. For | 1049 | * Figure out how many pages we'll be manipulating here. For |
1061 | * non allocating write, we just change the one | 1050 | * non allocating write, we just change the one |
1062 | * page. Otherwise, we'll need a whole clusters worth. | 1051 | * page. Otherwise, we'll need a whole clusters worth. |
1063 | */ | 1052 | */ |
1064 | if (new) | ||
1065 | numpages = ocfs2_pages_per_cluster(inode->i_sb); | ||
1066 | |||
1067 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
1068 | if (!cpages) { | ||
1069 | ret = -ENOMEM; | ||
1070 | mlog_errno(ret); | ||
1071 | return ret; | ||
1072 | } | ||
1073 | |||
1074 | /* | ||
1075 | * Fill our page array first. That way we've grabbed enough so | ||
1076 | * that we can zero and flush if we error after adding the | ||
1077 | * extent. | ||
1078 | */ | ||
1079 | if (new) { | 1053 | if (new) { |
1080 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | 1054 | wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); |
1081 | wc->w_cpos); | 1055 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); |
1082 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
1083 | } else { | 1056 | } else { |
1084 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | 1057 | wc->w_num_pages = 1; |
1085 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | 1058 | start = target_index; |
1086 | } | 1059 | } |
1087 | 1060 | ||
1088 | for(i = 0; i < numpages; i++) { | 1061 | for(i = 0; i < wc->w_num_pages; i++) { |
1089 | index = start + i; | 1062 | index = start + i; |
1090 | 1063 | ||
1091 | cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); | 1064 | if (index == target_index && mmap_page) { |
1092 | if (!cpages[i]) { | 1065 | /* |
1093 | ret = -ENOMEM; | 1066 | * ocfs2_pagemkwrite() is a little different |
1094 | mlog_errno(ret); | 1067 | * and wants us to directly use the page |
1095 | goto out; | 1068 | * passed in. |
1069 | */ | ||
1070 | lock_page(mmap_page); | ||
1071 | |||
1072 | if (mmap_page->mapping != mapping) { | ||
1073 | unlock_page(mmap_page); | ||
1074 | /* | ||
1075 | * Sanity check - the locking in | ||
1076 | * ocfs2_pagemkwrite() should ensure | ||
1077 | * that this code doesn't trigger. | ||
1078 | */ | ||
1079 | ret = -EINVAL; | ||
1080 | mlog_errno(ret); | ||
1081 | goto out; | ||
1082 | } | ||
1083 | |||
1084 | page_cache_get(mmap_page); | ||
1085 | wc->w_pages[i] = mmap_page; | ||
1086 | } else { | ||
1087 | wc->w_pages[i] = find_or_create_page(mapping, index, | ||
1088 | GFP_NOFS); | ||
1089 | if (!wc->w_pages[i]) { | ||
1090 | ret = -ENOMEM; | ||
1091 | mlog_errno(ret); | ||
1092 | goto out; | ||
1093 | } | ||
1096 | } | 1094 | } |
1095 | |||
1096 | if (index == target_index) | ||
1097 | wc->w_target_page = wc->w_pages[i]; | ||
1097 | } | 1098 | } |
1099 | out: | ||
1100 | return ret; | ||
1101 | } | ||
1102 | |||
1103 | /* | ||
1104 | * Prepare a single cluster for write one cluster into the file. | ||
1105 | */ | ||
1106 | static int ocfs2_write_cluster(struct address_space *mapping, | ||
1107 | u32 phys, unsigned int unwritten, | ||
1108 | struct ocfs2_alloc_context *data_ac, | ||
1109 | struct ocfs2_alloc_context *meta_ac, | ||
1110 | struct ocfs2_write_ctxt *wc, u32 cpos, | ||
1111 | loff_t user_pos, unsigned user_len) | ||
1112 | { | ||
1113 | int ret, i, new, should_zero = 0; | ||
1114 | u64 v_blkno, p_blkno; | ||
1115 | struct inode *inode = mapping->host; | ||
1116 | |||
1117 | new = phys == 0 ? 1 : 0; | ||
1118 | if (new || unwritten) | ||
1119 | should_zero = 1; | ||
1098 | 1120 | ||
1099 | if (new) { | 1121 | if (new) { |
1122 | u32 tmp_pos; | ||
1123 | |||
1100 | /* | 1124 | /* |
1101 | * This is safe to call with the page locks - it won't take | 1125 | * This is safe to call with the page locks - it won't take |
1102 | * any additional semaphores or cluster locks. | 1126 | * any additional semaphores or cluster locks. |
1103 | */ | 1127 | */ |
1104 | tmp_pos = wc->w_cpos; | 1128 | tmp_pos = cpos; |
1105 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | 1129 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, |
1106 | &tmp_pos, 1, di_bh, handle, | 1130 | &tmp_pos, 1, 0, wc->w_di_bh, |
1107 | data_ac, meta_ac, NULL); | 1131 | wc->w_handle, data_ac, |
1132 | meta_ac, NULL); | ||
1108 | /* | 1133 | /* |
1109 | * This shouldn't happen because we must have already | 1134 | * This shouldn't happen because we must have already |
1110 | * calculated the correct meta data allocation required. The | 1135 | * calculated the correct meta data allocation required. The |
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | |||
1121 | mlog_errno(ret); | 1146 | mlog_errno(ret); |
1122 | goto out; | 1147 | goto out; |
1123 | } | 1148 | } |
1149 | } else if (unwritten) { | ||
1150 | ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, | ||
1151 | wc->w_handle, cpos, 1, phys, | ||
1152 | meta_ac, &wc->w_dealloc); | ||
1153 | if (ret < 0) { | ||
1154 | mlog_errno(ret); | ||
1155 | goto out; | ||
1156 | } | ||
1124 | } | 1157 | } |
1125 | 1158 | ||
1159 | if (should_zero) | ||
1160 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | ||
1161 | else | ||
1162 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | ||
1163 | |||
1164 | /* | ||
1165 | * The only reason this should fail is due to an inability to | ||
1166 | * find the extent added. | ||
1167 | */ | ||
1126 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1168 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, |
1127 | NULL); | 1169 | NULL); |
1128 | if (ret < 0) { | 1170 | if (ret < 0) { |
1129 | 1171 | ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " | |
1130 | /* | 1172 | "at logical block %llu", |
1131 | * XXX: Should we go readonly here? | 1173 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1132 | */ | 1174 | (unsigned long long)v_blkno); |
1133 | |||
1134 | mlog_errno(ret); | ||
1135 | goto out; | 1175 | goto out; |
1136 | } | 1176 | } |
1137 | 1177 | ||
1138 | BUG_ON(p_blkno == 0); | 1178 | BUG_ON(p_blkno == 0); |
1139 | 1179 | ||
1140 | for(i = 0; i < numpages; i++) { | 1180 | for(i = 0; i < wc->w_num_pages; i++) { |
1141 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | 1181 | int tmpret; |
1142 | wc, new); | 1182 | |
1143 | if (ret < 0) { | 1183 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, |
1144 | mlog_errno(ret); | 1184 | wc->w_pages[i], cpos, |
1145 | goto out; | 1185 | user_pos, user_len, |
1186 | should_zero); | ||
1187 | if (tmpret) { | ||
1188 | mlog_errno(tmpret); | ||
1189 | if (ret == 0) | ||
1190 | tmpret = ret; | ||
1146 | } | 1191 | } |
1147 | |||
1148 | copied += ret; | ||
1149 | } | 1192 | } |
1150 | 1193 | ||
1194 | /* | ||
1195 | * We only have cleanup to do in case of allocating write. | ||
1196 | */ | ||
1197 | if (ret && new) | ||
1198 | ocfs2_write_failure(inode, wc, user_pos, user_len); | ||
1199 | |||
1151 | out: | 1200 | out: |
1152 | for(i = 0; i < numpages; i++) { | 1201 | |
1153 | unlock_page(cpages[i]); | 1202 | return ret; |
1154 | mark_page_accessed(cpages[i]); | 1203 | } |
1155 | page_cache_release(cpages[i]); | 1204 | |
1205 | static int ocfs2_write_cluster_by_desc(struct address_space *mapping, | ||
1206 | struct ocfs2_alloc_context *data_ac, | ||
1207 | struct ocfs2_alloc_context *meta_ac, | ||
1208 | struct ocfs2_write_ctxt *wc, | ||
1209 | loff_t pos, unsigned len) | ||
1210 | { | ||
1211 | int ret, i; | ||
1212 | struct ocfs2_write_cluster_desc *desc; | ||
1213 | |||
1214 | for (i = 0; i < wc->w_clen; i++) { | ||
1215 | desc = &wc->w_desc[i]; | ||
1216 | |||
1217 | ret = ocfs2_write_cluster(mapping, desc->c_phys, | ||
1218 | desc->c_unwritten, data_ac, meta_ac, | ||
1219 | wc, desc->c_cpos, pos, len); | ||
1220 | if (ret) { | ||
1221 | mlog_errno(ret); | ||
1222 | goto out; | ||
1223 | } | ||
1156 | } | 1224 | } |
1157 | kfree(cpages); | ||
1158 | 1225 | ||
1159 | return copied ? copied : ret; | 1226 | ret = 0; |
1227 | out: | ||
1228 | return ret; | ||
1160 | } | 1229 | } |
1161 | 1230 | ||
1162 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | 1231 | /* |
1163 | struct ocfs2_super *osb, loff_t pos, | 1232 | * ocfs2_write_end() wants to know which parts of the target page it |
1164 | size_t count, ocfs2_page_writer *cb, | 1233 | * should complete the write on. It's easiest to compute them ahead of |
1165 | void *cb_priv) | 1234 | * time when a more complete view of the write is available. |
1235 | */ | ||
1236 | static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | ||
1237 | struct ocfs2_write_ctxt *wc, | ||
1238 | loff_t pos, unsigned len, int alloc) | ||
1166 | { | 1239 | { |
1167 | wc->w_count = count; | 1240 | struct ocfs2_write_cluster_desc *desc; |
1168 | wc->w_pos = pos; | ||
1169 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
1170 | wc->w_finished_copy = 0; | ||
1171 | 1241 | ||
1172 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | 1242 | wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); |
1173 | wc->w_large_pages = 1; | 1243 | wc->w_target_to = wc->w_target_from + len; |
1174 | else | ||
1175 | wc->w_large_pages = 0; | ||
1176 | 1244 | ||
1177 | wc->w_write_data_page = cb; | 1245 | if (alloc == 0) |
1178 | wc->w_private = cb_priv; | 1246 | return; |
1247 | |||
1248 | /* | ||
1249 | * Allocating write - we may have different boundaries based | ||
1250 | * on page size and cluster size. | ||
1251 | * | ||
1252 | * NOTE: We can no longer compute one value from the other as | ||
1253 | * the actual write length and user provided length may be | ||
1254 | * different. | ||
1255 | */ | ||
1256 | |||
1257 | if (wc->w_large_pages) { | ||
1258 | /* | ||
1259 | * We only care about the 1st and last cluster within | ||
1260 | * our range and whether they should be zero'd or not. Either | ||
1261 | * value may be extended out to the start/end of a | ||
1262 | * newly allocated cluster. | ||
1263 | */ | ||
1264 | desc = &wc->w_desc[0]; | ||
1265 | if (ocfs2_should_zero_cluster(desc)) | ||
1266 | ocfs2_figure_cluster_boundaries(osb, | ||
1267 | desc->c_cpos, | ||
1268 | &wc->w_target_from, | ||
1269 | NULL); | ||
1270 | |||
1271 | desc = &wc->w_desc[wc->w_clen - 1]; | ||
1272 | if (ocfs2_should_zero_cluster(desc)) | ||
1273 | ocfs2_figure_cluster_boundaries(osb, | ||
1274 | desc->c_cpos, | ||
1275 | NULL, | ||
1276 | &wc->w_target_to); | ||
1277 | } else { | ||
1278 | wc->w_target_from = 0; | ||
1279 | wc->w_target_to = PAGE_CACHE_SIZE; | ||
1280 | } | ||
1179 | } | 1281 | } |
1180 | 1282 | ||
1181 | /* | 1283 | /* |
1182 | * Write a cluster to an inode. The cluster may not be allocated yet, | 1284 | * Populate each single-cluster write descriptor in the write context |
1183 | * in which case it will be. This only exists for buffered writes - | 1285 | * with information about the i/o to be done. |
1184 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
1185 | * | ||
1186 | * The caller is responsible for incrementing pos, written counts, etc | ||
1187 | * | 1286 | * |
1188 | * For file systems that don't support sparse files, pre-allocation | 1287 | * Returns the number of clusters that will have to be allocated, as |
1189 | * and page zeroing up until cpos should be done prior to this | 1288 | * well as a worst case estimate of the number of extent records that |
1190 | * function call. | 1289 | * would have to be created during a write to an unwritten region. |
1191 | * | ||
1192 | * Callers should be holding i_sem, and the rw cluster lock. | ||
1193 | * | ||
1194 | * Returns the number of user bytes written, or less than zero for | ||
1195 | * error. | ||
1196 | */ | 1290 | */ |
1197 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | 1291 | static int ocfs2_populate_write_desc(struct inode *inode, |
1198 | size_t count, ocfs2_page_writer *actor, | 1292 | struct ocfs2_write_ctxt *wc, |
1199 | void *priv) | 1293 | unsigned int *clusters_to_alloc, |
1294 | unsigned int *extents_to_split) | ||
1295 | { | ||
1296 | int ret; | ||
1297 | struct ocfs2_write_cluster_desc *desc; | ||
1298 | unsigned int num_clusters = 0; | ||
1299 | unsigned int ext_flags = 0; | ||
1300 | u32 phys = 0; | ||
1301 | int i; | ||
1302 | |||
1303 | *clusters_to_alloc = 0; | ||
1304 | *extents_to_split = 0; | ||
1305 | |||
1306 | for (i = 0; i < wc->w_clen; i++) { | ||
1307 | desc = &wc->w_desc[i]; | ||
1308 | desc->c_cpos = wc->w_cpos + i; | ||
1309 | |||
1310 | if (num_clusters == 0) { | ||
1311 | /* | ||
1312 | * Need to look up the next extent record. | ||
1313 | */ | ||
1314 | ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, | ||
1315 | &num_clusters, &ext_flags); | ||
1316 | if (ret) { | ||
1317 | mlog_errno(ret); | ||
1318 | goto out; | ||
1319 | } | ||
1320 | |||
1321 | /* | ||
1322 | * Assume worst case - that we're writing in | ||
1323 | * the middle of the extent. | ||
1324 | * | ||
1325 | * We can assume that the write proceeds from | ||
1326 | * left to right, in which case the extent | ||
1327 | * insert code is smart enough to coalesce the | ||
1328 | * next splits into the previous records created. | ||
1329 | */ | ||
1330 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | ||
1331 | *extents_to_split = *extents_to_split + 2; | ||
1332 | } else if (phys) { | ||
1333 | /* | ||
1334 | * Only increment phys if it doesn't describe | ||
1335 | * a hole. | ||
1336 | */ | ||
1337 | phys++; | ||
1338 | } | ||
1339 | |||
1340 | desc->c_phys = phys; | ||
1341 | if (phys == 0) { | ||
1342 | desc->c_new = 1; | ||
1343 | *clusters_to_alloc = *clusters_to_alloc + 1; | ||
1344 | } | ||
1345 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | ||
1346 | desc->c_unwritten = 1; | ||
1347 | |||
1348 | num_clusters--; | ||
1349 | } | ||
1350 | |||
1351 | ret = 0; | ||
1352 | out: | ||
1353 | return ret; | ||
1354 | } | ||
1355 | |||
1356 | int ocfs2_write_begin_nolock(struct address_space *mapping, | ||
1357 | loff_t pos, unsigned len, unsigned flags, | ||
1358 | struct page **pagep, void **fsdata, | ||
1359 | struct buffer_head *di_bh, struct page *mmap_page) | ||
1200 | { | 1360 | { |
1201 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | 1361 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; |
1202 | ssize_t written = 0; | 1362 | unsigned int clusters_to_alloc, extents_to_split; |
1203 | u32 phys; | 1363 | struct ocfs2_write_ctxt *wc; |
1204 | struct inode *inode = file->f_mapping->host; | 1364 | struct inode *inode = mapping->host; |
1205 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1365 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1206 | struct buffer_head *di_bh = NULL; | ||
1207 | struct ocfs2_dinode *di; | 1366 | struct ocfs2_dinode *di; |
1208 | struct ocfs2_alloc_context *data_ac = NULL; | 1367 | struct ocfs2_alloc_context *data_ac = NULL; |
1209 | struct ocfs2_alloc_context *meta_ac = NULL; | 1368 | struct ocfs2_alloc_context *meta_ac = NULL; |
1210 | handle_t *handle; | 1369 | handle_t *handle; |
1211 | struct ocfs2_write_ctxt wc; | ||
1212 | |||
1213 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
1214 | 1370 | ||
1215 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | 1371 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); |
1216 | if (ret) { | 1372 | if (ret) { |
1217 | mlog_errno(ret); | 1373 | mlog_errno(ret); |
1218 | goto out; | 1374 | return ret; |
1219 | } | 1375 | } |
1220 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1221 | |||
1222 | /* | ||
1223 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1224 | * the mapping, zeroing and tree manipulation within | ||
1225 | * ocfs2_write() will be safe against ->readpage(). This | ||
1226 | * should also serve to lock out allocation from a shared | ||
1227 | * writeable region. | ||
1228 | */ | ||
1229 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1230 | 1376 | ||
1231 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | 1377 | ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, |
1378 | &extents_to_split); | ||
1232 | if (ret) { | 1379 | if (ret) { |
1233 | mlog_errno(ret); | 1380 | mlog_errno(ret); |
1234 | goto out_meta; | 1381 | goto out; |
1235 | } | 1382 | } |
1236 | 1383 | ||
1237 | /* phys == 0 means that allocation is required. */ | 1384 | di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; |
1238 | if (phys == 0) { | 1385 | |
1239 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | 1386 | /* |
1387 | * We set w_target_from, w_target_to here so that | ||
1388 | * ocfs2_write_end() knows which range in the target page to | ||
1389 | * write out. An allocation requires that we write the entire | ||
1390 | * cluster range. | ||
1391 | */ | ||
1392 | if (clusters_to_alloc || extents_to_split) { | ||
1393 | /* | ||
1394 | * XXX: We are stretching the limits of | ||
1395 | * ocfs2_lock_allocators(). It greatly over-estimates | ||
1396 | * the work to be done. | ||
1397 | */ | ||
1398 | ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, | ||
1399 | extents_to_split, &data_ac, &meta_ac); | ||
1240 | if (ret) { | 1400 | if (ret) { |
1241 | mlog_errno(ret); | 1401 | mlog_errno(ret); |
1242 | goto out_meta; | 1402 | goto out; |
1243 | } | 1403 | } |
1244 | 1404 | ||
1245 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | 1405 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, |
1246 | } | 1406 | clusters_to_alloc); |
1247 | 1407 | ||
1248 | ret = ocfs2_data_lock(inode, 1); | ||
1249 | if (ret) { | ||
1250 | mlog_errno(ret); | ||
1251 | goto out_meta; | ||
1252 | } | 1408 | } |
1253 | 1409 | ||
1410 | ocfs2_set_target_boundaries(osb, wc, pos, len, | ||
1411 | clusters_to_alloc + extents_to_split); | ||
1412 | |||
1254 | handle = ocfs2_start_trans(osb, credits); | 1413 | handle = ocfs2_start_trans(osb, credits); |
1255 | if (IS_ERR(handle)) { | 1414 | if (IS_ERR(handle)) { |
1256 | ret = PTR_ERR(handle); | 1415 | ret = PTR_ERR(handle); |
1257 | mlog_errno(ret); | 1416 | mlog_errno(ret); |
1258 | goto out_data; | 1417 | goto out; |
1259 | } | 1418 | } |
1260 | 1419 | ||
1261 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | 1420 | wc->w_handle = handle; |
1262 | meta_ac, &wc); | 1421 | |
1263 | if (written < 0) { | 1422 | /* |
1264 | ret = written; | 1423 | * We don't want this to fail in ocfs2_write_end(), so do it |
1424 | * here. | ||
1425 | */ | ||
1426 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | ||
1427 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1428 | if (ret) { | ||
1265 | mlog_errno(ret); | 1429 | mlog_errno(ret); |
1266 | goto out_commit; | 1430 | goto out_commit; |
1267 | } | 1431 | } |
1268 | 1432 | ||
1269 | ret = ocfs2_journal_access(handle, inode, di_bh, | 1433 | /* |
1270 | OCFS2_JOURNAL_ACCESS_WRITE); | 1434 | * Fill our page array first. That way we've grabbed enough so |
1435 | * that we can zero and flush if we error after adding the | ||
1436 | * extent. | ||
1437 | */ | ||
1438 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | ||
1439 | clusters_to_alloc + extents_to_split, | ||
1440 | mmap_page); | ||
1271 | if (ret) { | 1441 | if (ret) { |
1272 | mlog_errno(ret); | 1442 | mlog_errno(ret); |
1273 | goto out_commit; | 1443 | goto out_commit; |
1274 | } | 1444 | } |
1275 | 1445 | ||
1276 | pos += written; | 1446 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, |
1447 | len); | ||
1448 | if (ret) { | ||
1449 | mlog_errno(ret); | ||
1450 | goto out_commit; | ||
1451 | } | ||
1452 | |||
1453 | if (data_ac) | ||
1454 | ocfs2_free_alloc_context(data_ac); | ||
1455 | if (meta_ac) | ||
1456 | ocfs2_free_alloc_context(meta_ac); | ||
1457 | |||
1458 | *pagep = wc->w_target_page; | ||
1459 | *fsdata = wc; | ||
1460 | return 0; | ||
1461 | out_commit: | ||
1462 | ocfs2_commit_trans(osb, handle); | ||
1463 | |||
1464 | out: | ||
1465 | ocfs2_free_write_ctxt(wc); | ||
1466 | |||
1467 | if (data_ac) | ||
1468 | ocfs2_free_alloc_context(data_ac); | ||
1469 | if (meta_ac) | ||
1470 | ocfs2_free_alloc_context(meta_ac); | ||
1471 | return ret; | ||
1472 | } | ||
1473 | |||
1474 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, | ||
1475 | loff_t pos, unsigned len, unsigned flags, | ||
1476 | struct page **pagep, void **fsdata) | ||
1477 | { | ||
1478 | int ret; | ||
1479 | struct buffer_head *di_bh = NULL; | ||
1480 | struct inode *inode = mapping->host; | ||
1481 | |||
1482 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
1483 | if (ret) { | ||
1484 | mlog_errno(ret); | ||
1485 | return ret; | ||
1486 | } | ||
1487 | |||
1488 | /* | ||
1489 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1490 | * the mapping, zeroing and tree manipulation within | ||
1491 | * ocfs2_write() will be safe against ->readpage(). This | ||
1492 | * should also serve to lock out allocation from a shared | ||
1493 | * writeable region. | ||
1494 | */ | ||
1495 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1496 | |||
1497 | ret = ocfs2_data_lock(inode, 1); | ||
1498 | if (ret) { | ||
1499 | mlog_errno(ret); | ||
1500 | goto out_fail; | ||
1501 | } | ||
1502 | |||
1503 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, | ||
1504 | fsdata, di_bh, NULL); | ||
1505 | if (ret) { | ||
1506 | mlog_errno(ret); | ||
1507 | goto out_fail_data; | ||
1508 | } | ||
1509 | |||
1510 | brelse(di_bh); | ||
1511 | |||
1512 | return 0; | ||
1513 | |||
1514 | out_fail_data: | ||
1515 | ocfs2_data_unlock(inode, 1); | ||
1516 | out_fail: | ||
1517 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1518 | |||
1519 | brelse(di_bh); | ||
1520 | ocfs2_meta_unlock(inode, 1); | ||
1521 | |||
1522 | return ret; | ||
1523 | } | ||
1524 | |||
1525 | int ocfs2_write_end_nolock(struct address_space *mapping, | ||
1526 | loff_t pos, unsigned len, unsigned copied, | ||
1527 | struct page *page, void *fsdata) | ||
1528 | { | ||
1529 | int i; | ||
1530 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | ||
1531 | struct inode *inode = mapping->host; | ||
1532 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1533 | struct ocfs2_write_ctxt *wc = fsdata; | ||
1534 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | ||
1535 | handle_t *handle = wc->w_handle; | ||
1536 | struct page *tmppage; | ||
1537 | |||
1538 | if (unlikely(copied < len)) { | ||
1539 | if (!PageUptodate(wc->w_target_page)) | ||
1540 | copied = 0; | ||
1541 | |||
1542 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | ||
1543 | start+len); | ||
1544 | } | ||
1545 | flush_dcache_page(wc->w_target_page); | ||
1546 | |||
1547 | for(i = 0; i < wc->w_num_pages; i++) { | ||
1548 | tmppage = wc->w_pages[i]; | ||
1549 | |||
1550 | if (tmppage == wc->w_target_page) { | ||
1551 | from = wc->w_target_from; | ||
1552 | to = wc->w_target_to; | ||
1553 | |||
1554 | BUG_ON(from > PAGE_CACHE_SIZE || | ||
1555 | to > PAGE_CACHE_SIZE || | ||
1556 | to < from); | ||
1557 | } else { | ||
1558 | /* | ||
1559 | * Pages adjacent to the target (if any) imply | ||
1560 | * a hole-filling write in which case we want | ||
1561 | * to flush their entire range. | ||
1562 | */ | ||
1563 | from = 0; | ||
1564 | to = PAGE_CACHE_SIZE; | ||
1565 | } | ||
1566 | |||
1567 | if (ocfs2_should_order_data(inode)) | ||
1568 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | ||
1569 | from, to, NULL, | ||
1570 | ocfs2_journal_dirty_data); | ||
1571 | |||
1572 | block_commit_write(tmppage, from, to); | ||
1573 | } | ||
1574 | |||
1575 | pos += copied; | ||
1277 | if (pos > inode->i_size) { | 1576 | if (pos > inode->i_size) { |
1278 | i_size_write(inode, pos); | 1577 | i_size_write(inode, pos); |
1279 | mark_inode_dirty(inode); | 1578 | mark_inode_dirty(inode); |
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | |||
1283 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1582 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
1284 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | 1583 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); |
1285 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 1584 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
1585 | ocfs2_journal_dirty(handle, wc->w_di_bh); | ||
1286 | 1586 | ||
1287 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1288 | if (ret) | ||
1289 | mlog_errno(ret); | ||
1290 | |||
1291 | out_commit: | ||
1292 | ocfs2_commit_trans(osb, handle); | 1587 | ocfs2_commit_trans(osb, handle); |
1293 | 1588 | ||
1294 | out_data: | 1589 | ocfs2_run_deallocs(osb, &wc->w_dealloc); |
1295 | ocfs2_data_unlock(inode, 1); | 1590 | |
1591 | ocfs2_free_write_ctxt(wc); | ||
1592 | |||
1593 | return copied; | ||
1594 | } | ||
1595 | |||
1596 | int ocfs2_write_end(struct file *file, struct address_space *mapping, | ||
1597 | loff_t pos, unsigned len, unsigned copied, | ||
1598 | struct page *page, void *fsdata) | ||
1599 | { | ||
1600 | int ret; | ||
1601 | struct inode *inode = mapping->host; | ||
1296 | 1602 | ||
1297 | out_meta: | 1603 | ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); |
1604 | |||
1605 | ocfs2_data_unlock(inode, 1); | ||
1298 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 1606 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
1299 | ocfs2_meta_unlock(inode, 1); | 1607 | ocfs2_meta_unlock(inode, 1); |
1300 | 1608 | ||
1301 | out: | 1609 | return ret; |
1302 | brelse(di_bh); | ||
1303 | if (data_ac) | ||
1304 | ocfs2_free_alloc_context(data_ac); | ||
1305 | if (meta_ac) | ||
1306 | ocfs2_free_alloc_context(meta_ac); | ||
1307 | |||
1308 | return written ? written : ret; | ||
1309 | } | 1610 | } |
1310 | 1611 | ||
1311 | const struct address_space_operations ocfs2_aops = { | 1612 | const struct address_space_operations ocfs2_aops = { |