diff options
Diffstat (limited to 'fs/ocfs2/aops.c')
| -rw-r--r-- | fs/ocfs2/aops.c | 1015 |
1 files changed, 658 insertions, 357 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a480b09c79b9..84bf6e79de23 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
| @@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
| 684 | bh = bh->b_this_page, block_start += bsize) { | 684 | bh = bh->b_this_page, block_start += bsize) { |
| 685 | block_end = block_start + bsize; | 685 | block_end = block_start + bsize; |
| 686 | 686 | ||
| 687 | clear_buffer_new(bh); | ||
| 688 | |||
| 687 | /* | 689 | /* |
| 688 | * Ignore blocks outside of our i/o range - | 690 | * Ignore blocks outside of our i/o range - |
| 689 | * they may belong to unallocated clusters. | 691 | * they may belong to unallocated clusters. |
| @@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
| 698 | * For an allocating write with cluster size >= page | 700 | * For an allocating write with cluster size >= page |
| 699 | * size, we always write the entire page. | 701 | * size, we always write the entire page. |
| 700 | */ | 702 | */ |
| 701 | 703 | if (new) | |
| 702 | if (buffer_new(bh)) | 704 | set_buffer_new(bh); |
| 703 | clear_buffer_new(bh); | ||
| 704 | 705 | ||
| 705 | if (!buffer_mapped(bh)) { | 706 | if (!buffer_mapped(bh)) { |
| 706 | map_bh(bh, inode->i_sb, *p_blkno); | 707 | map_bh(bh, inode->i_sb, *p_blkno); |
| @@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
| 711 | if (!buffer_uptodate(bh)) | 712 | if (!buffer_uptodate(bh)) |
| 712 | set_buffer_uptodate(bh); | 713 | set_buffer_uptodate(bh); |
| 713 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | 714 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
| 714 | (block_start < from || block_end > to)) { | 715 | !buffer_new(bh) && |
| 716 | (block_start < from || block_end > to)) { | ||
| 715 | ll_rw_block(READ, 1, &bh); | 717 | ll_rw_block(READ, 1, &bh); |
| 716 | *wait_bh++=bh; | 718 | *wait_bh++=bh; |
| 717 | } | 719 | } |
| @@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
| 738 | bh = head; | 740 | bh = head; |
| 739 | block_start = 0; | 741 | block_start = 0; |
| 740 | do { | 742 | do { |
| 741 | void *kaddr; | ||
| 742 | |||
| 743 | block_end = block_start + bsize; | 743 | block_end = block_start + bsize; |
| 744 | if (block_end <= from) | 744 | if (block_end <= from) |
| 745 | goto next_bh; | 745 | goto next_bh; |
| 746 | if (block_start >= to) | 746 | if (block_start >= to) |
| 747 | break; | 747 | break; |
| 748 | 748 | ||
| 749 | kaddr = kmap_atomic(page, KM_USER0); | 749 | zero_user_page(page, block_start, bh->b_size, KM_USER0); |
| 750 | memset(kaddr+block_start, 0, bh->b_size); | ||
| 751 | flush_dcache_page(page); | ||
| 752 | kunmap_atomic(kaddr, KM_USER0); | ||
| 753 | set_buffer_uptodate(bh); | 750 | set_buffer_uptodate(bh); |
| 754 | mark_buffer_dirty(bh); | 751 | mark_buffer_dirty(bh); |
| 755 | 752 | ||
| @@ -761,217 +758,240 @@ next_bh: | |||
| 761 | return ret; | 758 | return ret; |
| 762 | } | 759 | } |
| 763 | 760 | ||
| 761 | #if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) | ||
| 762 | #define OCFS2_MAX_CTXT_PAGES 1 | ||
| 763 | #else | ||
| 764 | #define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) | ||
| 765 | #endif | ||
| 766 | |||
| 767 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) | ||
| 768 | |||
| 764 | /* | 769 | /* |
| 765 | * This will copy user data from the buffer page in the splice | 770 | * Describe the state of a single cluster to be written to. |
| 766 | * context. | ||
| 767 | * | ||
| 768 | * For now, we ignore SPLICE_F_MOVE as that would require some extra | ||
| 769 | * communication out all the way to ocfs2_write(). | ||
| 770 | */ | 771 | */ |
| 771 | int ocfs2_map_and_write_splice_data(struct inode *inode, | 772 | struct ocfs2_write_cluster_desc { |
| 772 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | 773 | u32 c_cpos; |
| 773 | unsigned int *ret_from, unsigned int *ret_to) | 774 | u32 c_phys; |
| 775 | /* | ||
| 776 | * Give this a unique field because c_phys eventually gets | ||
| 777 | * filled. | ||
| 778 | */ | ||
| 779 | unsigned c_new; | ||
| 780 | unsigned c_unwritten; | ||
| 781 | }; | ||
| 782 | |||
| 783 | static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) | ||
| 774 | { | 784 | { |
| 775 | int ret; | 785 | return d->c_new || d->c_unwritten; |
| 776 | unsigned int to, from, cluster_start, cluster_end; | 786 | } |
| 777 | char *src, *dst; | ||
| 778 | struct ocfs2_splice_write_priv *sp = wc->w_private; | ||
| 779 | struct pipe_buffer *buf = sp->s_buf; | ||
| 780 | unsigned long bytes, src_from; | ||
| 781 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 782 | 787 | ||
| 783 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 788 | struct ocfs2_write_ctxt { |
| 784 | &cluster_end); | 789 | /* Logical cluster position / len of write */ |
| 790 | u32 w_cpos; | ||
| 791 | u32 w_clen; | ||
| 785 | 792 | ||
| 786 | from = sp->s_offset; | 793 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; |
| 787 | src_from = sp->s_buf_offset; | ||
| 788 | bytes = wc->w_count; | ||
| 789 | 794 | ||
| 790 | if (wc->w_large_pages) { | 795 | /* |
| 791 | /* | 796 | * This is true if page_size > cluster_size. |
| 792 | * For cluster size < page size, we have to | 797 | * |
| 793 | * calculate pos within the cluster and obey | 798 | * It triggers a set of special cases during write which might |
| 794 | * the rightmost boundary. | 799 | * have to deal with allocating writes to partial pages. |
| 795 | */ | 800 | */ |
| 796 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | 801 | unsigned int w_large_pages; |
| 797 | - (wc->w_pos & (osb->s_clustersize - 1)))); | 802 | |
| 798 | } | 803 | /* |
| 799 | to = from + bytes; | 804 | * Pages involved in this write. |
| 805 | * | ||
| 806 | * w_target_page is the page being written to by the user. | ||
| 807 | * | ||
| 808 | * w_pages is an array of pages which always contains | ||
| 809 | * w_target_page, and in the case of an allocating write with | ||
| 810 | * page_size < cluster size, it will contain zero'd and mapped | ||
| 811 | * pages adjacent to w_target_page which need to be written | ||
| 812 | * out so that future reads from that region will get | ||
| 813 | * zeros. | ||
| 814 | */ | ||
| 815 | struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; | ||
| 816 | unsigned int w_num_pages; | ||
| 817 | struct page *w_target_page; | ||
| 800 | 818 | ||
| 801 | BUG_ON(from > PAGE_CACHE_SIZE); | 819 | /* |
| 802 | BUG_ON(to > PAGE_CACHE_SIZE); | 820 | * ocfs2_write_end() uses this to know what the real range to |
| 803 | BUG_ON(from < cluster_start); | 821 | * write in the target should be. |
| 804 | BUG_ON(to > cluster_end); | 822 | */ |
| 823 | unsigned int w_target_from; | ||
| 824 | unsigned int w_target_to; | ||
| 805 | 825 | ||
| 806 | if (wc->w_this_page_new) | 826 | /* |
| 807 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 827 | * We could use journal_current_handle() but this is cleaner, |
| 808 | cluster_start, cluster_end, 1); | 828 | * IMHO -Mark |
| 809 | else | 829 | */ |
| 810 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 830 | handle_t *w_handle; |
| 811 | from, to, 0); | 831 | |
| 812 | if (ret) { | 832 | struct buffer_head *w_di_bh; |
| 813 | mlog_errno(ret); | 833 | |
| 814 | goto out; | 834 | struct ocfs2_cached_dealloc_ctxt w_dealloc; |
| 835 | }; | ||
| 836 | |||
| 837 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | ||
| 838 | { | ||
| 839 | int i; | ||
| 840 | |||
| 841 | for(i = 0; i < wc->w_num_pages; i++) { | ||
| 842 | if (wc->w_pages[i] == NULL) | ||
| 843 | continue; | ||
| 844 | |||
| 845 | unlock_page(wc->w_pages[i]); | ||
| 846 | mark_page_accessed(wc->w_pages[i]); | ||
| 847 | page_cache_release(wc->w_pages[i]); | ||
| 815 | } | 848 | } |
| 816 | 849 | ||
| 817 | src = buf->ops->map(sp->s_pipe, buf, 1); | 850 | brelse(wc->w_di_bh); |
| 818 | dst = kmap_atomic(wc->w_this_page, KM_USER1); | 851 | kfree(wc); |
| 819 | memcpy(dst + from, src + src_from, bytes); | 852 | } |
| 820 | kunmap_atomic(wc->w_this_page, KM_USER1); | 853 | |
| 821 | buf->ops->unmap(sp->s_pipe, buf, src); | 854 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, |
| 855 | struct ocfs2_super *osb, loff_t pos, | ||
| 856 | unsigned len, struct buffer_head *di_bh) | ||
| 857 | { | ||
| 858 | struct ocfs2_write_ctxt *wc; | ||
| 859 | |||
| 860 | wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); | ||
| 861 | if (!wc) | ||
| 862 | return -ENOMEM; | ||
| 822 | 863 | ||
| 823 | wc->w_finished_copy = 1; | 864 | wc->w_cpos = pos >> osb->s_clustersize_bits; |
| 865 | wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); | ||
| 866 | get_bh(di_bh); | ||
| 867 | wc->w_di_bh = di_bh; | ||
| 824 | 868 | ||
| 825 | *ret_from = from; | 869 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) |
| 826 | *ret_to = to; | 870 | wc->w_large_pages = 1; |
| 827 | out: | 871 | else |
| 872 | wc->w_large_pages = 0; | ||
| 873 | |||
| 874 | ocfs2_init_dealloc_ctxt(&wc->w_dealloc); | ||
| 875 | |||
| 876 | *wcp = wc; | ||
| 828 | 877 | ||
| 829 | return bytes ? (unsigned int)bytes : ret; | 878 | return 0; |
| 830 | } | 879 | } |
| 831 | 880 | ||
| 832 | /* | 881 | /* |
| 833 | * This will copy user data from the iovec in the buffered write | 882 | * If a page has any new buffers, zero them out here, and mark them uptodate |
| 834 | * context. | 883 | * and dirty so they'll be written out (in order to prevent uninitialised |
| 884 | * block data from leaking). And clear the new bit. | ||
| 835 | */ | 885 | */ |
| 836 | int ocfs2_map_and_write_user_data(struct inode *inode, | 886 | static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) |
| 837 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
| 838 | unsigned int *ret_from, unsigned int *ret_to) | ||
| 839 | { | 887 | { |
| 840 | int ret; | 888 | unsigned int block_start, block_end; |
| 841 | unsigned int to, from, cluster_start, cluster_end; | 889 | struct buffer_head *head, *bh; |
| 842 | unsigned long bytes, src_from; | ||
| 843 | char *dst; | ||
| 844 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
| 845 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
| 846 | char __user *buf; | ||
| 847 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 848 | 890 | ||
| 849 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 891 | BUG_ON(!PageLocked(page)); |
| 850 | &cluster_end); | 892 | if (!page_has_buffers(page)) |
| 893 | return; | ||
| 851 | 894 | ||
| 852 | buf = cur_iov->iov_base + bp->b_cur_off; | 895 | bh = head = page_buffers(page); |
| 853 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | 896 | block_start = 0; |
| 897 | do { | ||
| 898 | block_end = block_start + bh->b_size; | ||
| 854 | 899 | ||
| 855 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | 900 | if (buffer_new(bh)) { |
| 901 | if (block_end > from && block_start < to) { | ||
| 902 | if (!PageUptodate(page)) { | ||
| 903 | unsigned start, end; | ||
| 856 | 904 | ||
| 857 | /* | 905 | start = max(from, block_start); |
| 858 | * This is a lot of comparisons, but it reads quite | 906 | end = min(to, block_end); |
| 859 | * easily, which is important here. | ||
| 860 | */ | ||
| 861 | /* Stay within the src page */ | ||
| 862 | bytes = PAGE_SIZE - src_from; | ||
| 863 | /* Stay within the vector */ | ||
| 864 | bytes = min(bytes, | ||
| 865 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
| 866 | /* Stay within count */ | ||
| 867 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
| 868 | /* | ||
| 869 | * For clustersize > page size, just stay within | ||
| 870 | * target page, otherwise we have to calculate pos | ||
| 871 | * within the cluster and obey the rightmost | ||
| 872 | * boundary. | ||
| 873 | */ | ||
| 874 | if (wc->w_large_pages) { | ||
| 875 | /* | ||
| 876 | * For cluster size < page size, we have to | ||
| 877 | * calculate pos within the cluster and obey | ||
| 878 | * the rightmost boundary. | ||
| 879 | */ | ||
| 880 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
| 881 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
| 882 | } else { | ||
| 883 | /* | ||
| 884 | * cluster size > page size is the most common | ||
| 885 | * case - we just stay within the target page | ||
| 886 | * boundary. | ||
| 887 | */ | ||
| 888 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
| 889 | } | ||
| 890 | 907 | ||
| 891 | to = from + bytes; | 908 | zero_user_page(page, start, end - start, KM_USER0); |
| 909 | set_buffer_uptodate(bh); | ||
| 910 | } | ||
| 892 | 911 | ||
| 893 | BUG_ON(from > PAGE_CACHE_SIZE); | 912 | clear_buffer_new(bh); |
| 894 | BUG_ON(to > PAGE_CACHE_SIZE); | 913 | mark_buffer_dirty(bh); |
| 895 | BUG_ON(from < cluster_start); | 914 | } |
| 896 | BUG_ON(to > cluster_end); | 915 | } |
| 897 | 916 | ||
| 898 | if (wc->w_this_page_new) | 917 | block_start = block_end; |
| 899 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 918 | bh = bh->b_this_page; |
| 900 | cluster_start, cluster_end, 1); | 919 | } while (bh != head); |
| 901 | else | 920 | } |
| 902 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
| 903 | from, to, 0); | ||
| 904 | if (ret) { | ||
| 905 | mlog_errno(ret); | ||
| 906 | goto out; | ||
| 907 | } | ||
| 908 | 921 | ||
| 909 | dst = kmap(wc->w_this_page); | 922 | /* |
| 910 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | 923 | * Only called when we have a failure during allocating write to write |
| 911 | kunmap(wc->w_this_page); | 924 | * zero's to the newly allocated region. |
| 925 | */ | ||
| 926 | static void ocfs2_write_failure(struct inode *inode, | ||
| 927 | struct ocfs2_write_ctxt *wc, | ||
| 928 | loff_t user_pos, unsigned user_len) | ||
| 929 | { | ||
| 930 | int i; | ||
| 931 | unsigned from, to; | ||
| 932 | struct page *tmppage; | ||
| 912 | 933 | ||
| 913 | /* | 934 | ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); |
| 914 | * XXX: This is slow, but simple. The caller of | ||
| 915 | * ocfs2_buffered_write_cluster() is responsible for | ||
| 916 | * passing through the iovecs, so it's difficult to | ||
| 917 | * predict what our next step is in here after our | ||
| 918 | * initial write. A future version should be pushing | ||
| 919 | * that iovec manipulation further down. | ||
| 920 | * | ||
| 921 | * By setting this, we indicate that a copy from user | ||
| 922 | * data was done, and subsequent calls for this | ||
| 923 | * cluster will skip copying more data. | ||
| 924 | */ | ||
| 925 | wc->w_finished_copy = 1; | ||
| 926 | 935 | ||
| 927 | *ret_from = from; | 936 | if (wc->w_large_pages) { |
| 928 | *ret_to = to; | 937 | from = wc->w_target_from; |
| 929 | out: | 938 | to = wc->w_target_to; |
| 939 | } else { | ||
| 940 | from = 0; | ||
| 941 | to = PAGE_CACHE_SIZE; | ||
| 942 | } | ||
| 943 | |||
| 944 | for(i = 0; i < wc->w_num_pages; i++) { | ||
| 945 | tmppage = wc->w_pages[i]; | ||
| 930 | 946 | ||
| 931 | return bytes ? (unsigned int)bytes : ret; | 947 | if (ocfs2_should_order_data(inode)) |
| 948 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | ||
| 949 | from, to, NULL, | ||
| 950 | ocfs2_journal_dirty_data); | ||
| 951 | |||
| 952 | block_commit_write(tmppage, from, to); | ||
| 953 | } | ||
| 932 | } | 954 | } |
| 933 | 955 | ||
| 934 | /* | 956 | static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, |
| 935 | * Map, fill and write a page to disk. | 957 | struct ocfs2_write_ctxt *wc, |
| 936 | * | 958 | struct page *page, u32 cpos, |
| 937 | * The work of copying data is done via callback. Newly allocated | 959 | loff_t user_pos, unsigned user_len, |
| 938 | * pages which don't take user data will be zero'd (set 'new' to | 960 | int new) |
| 939 | * indicate an allocating write) | ||
| 940 | * | ||
| 941 | * Returns a negative error code or the number of bytes copied into | ||
| 942 | * the page. | ||
| 943 | */ | ||
| 944 | static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
| 945 | u64 *p_blkno, struct page *page, | ||
| 946 | struct ocfs2_write_ctxt *wc, int new) | ||
| 947 | { | 961 | { |
| 948 | int ret, copied = 0; | 962 | int ret; |
| 949 | unsigned int from = 0, to = 0; | 963 | unsigned int map_from = 0, map_to = 0; |
| 950 | unsigned int cluster_start, cluster_end; | 964 | unsigned int cluster_start, cluster_end; |
| 951 | unsigned int zero_from = 0, zero_to = 0; | 965 | unsigned int user_data_from = 0, user_data_to = 0; |
| 952 | 966 | ||
| 953 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | 967 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, |
| 954 | &cluster_start, &cluster_end); | 968 | &cluster_start, &cluster_end); |
| 955 | 969 | ||
| 956 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | 970 | if (page == wc->w_target_page) { |
| 957 | && !wc->w_finished_copy) { | 971 | map_from = user_pos & (PAGE_CACHE_SIZE - 1); |
| 958 | 972 | map_to = map_from + user_len; | |
| 959 | wc->w_this_page = page; | 973 | |
| 960 | wc->w_this_page_new = new; | 974 | if (new) |
| 961 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | 975 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
| 962 | if (ret < 0) { | 976 | cluster_start, cluster_end, |
| 977 | new); | ||
| 978 | else | ||
| 979 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
| 980 | map_from, map_to, new); | ||
| 981 | if (ret) { | ||
| 963 | mlog_errno(ret); | 982 | mlog_errno(ret); |
| 964 | goto out; | 983 | goto out; |
| 965 | } | 984 | } |
| 966 | 985 | ||
| 967 | copied = ret; | 986 | user_data_from = map_from; |
| 968 | 987 | user_data_to = map_to; | |
| 969 | zero_from = from; | ||
| 970 | zero_to = to; | ||
| 971 | if (new) { | 988 | if (new) { |
| 972 | from = cluster_start; | 989 | map_from = cluster_start; |
| 973 | to = cluster_end; | 990 | map_to = cluster_end; |
| 974 | } | 991 | } |
| 992 | |||
| 993 | wc->w_target_from = map_from; | ||
| 994 | wc->w_target_to = map_to; | ||
| 975 | } else { | 995 | } else { |
| 976 | /* | 996 | /* |
| 977 | * If we haven't allocated the new page yet, we | 997 | * If we haven't allocated the new page yet, we |
| @@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
| 980 | */ | 1000 | */ |
| 981 | BUG_ON(!new); | 1001 | BUG_ON(!new); |
| 982 | 1002 | ||
| 983 | from = cluster_start; | 1003 | map_from = cluster_start; |
| 984 | to = cluster_end; | 1004 | map_to = cluster_end; |
| 985 | 1005 | ||
| 986 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | 1006 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
| 987 | cluster_start, cluster_end, 1); | 1007 | cluster_start, cluster_end, new); |
| 988 | if (ret) { | 1008 | if (ret) { |
| 989 | mlog_errno(ret); | 1009 | mlog_errno(ret); |
| 990 | goto out; | 1010 | goto out; |
| @@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
| 1003 | */ | 1023 | */ |
| 1004 | if (new && !PageUptodate(page)) | 1024 | if (new && !PageUptodate(page)) |
| 1005 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | 1025 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), |
| 1006 | wc->w_cpos, zero_from, zero_to); | 1026 | cpos, user_data_from, user_data_to); |
| 1007 | 1027 | ||
| 1008 | flush_dcache_page(page); | 1028 | flush_dcache_page(page); |
| 1009 | 1029 | ||
| 1010 | if (ocfs2_should_order_data(inode)) { | ||
| 1011 | ret = walk_page_buffers(handle, | ||
| 1012 | page_buffers(page), | ||
| 1013 | from, to, NULL, | ||
| 1014 | ocfs2_journal_dirty_data); | ||
| 1015 | if (ret < 0) | ||
| 1016 | mlog_errno(ret); | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | /* | ||
| 1020 | * We don't use generic_commit_write() because we need to | ||
| 1021 | * handle our own i_size update. | ||
| 1022 | */ | ||
| 1023 | ret = block_commit_write(page, from, to); | ||
| 1024 | if (ret) | ||
| 1025 | mlog_errno(ret); | ||
| 1026 | out: | 1030 | out: |
| 1027 | 1031 | return ret; | |
| 1028 | return copied ? copied : ret; | ||
| 1029 | } | 1032 | } |
| 1030 | 1033 | ||
| 1031 | /* | 1034 | /* |
| 1032 | * Do the actual write of some data into an inode. Optionally allocate | 1035 | * This function will only grab one clusters worth of pages. |
| 1033 | * in order to fulfill the write. | ||
| 1034 | * | ||
| 1035 | * cpos is the logical cluster offset within the file to write at | ||
| 1036 | * | ||
| 1037 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
| 1038 | * zero indicates that allocation is required. In this case, data_ac | ||
| 1039 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
| 1040 | * allocation isn't required). | ||
| 1041 | */ | 1036 | */ |
| 1042 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | 1037 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, |
| 1043 | struct buffer_head *di_bh, | 1038 | struct ocfs2_write_ctxt *wc, |
| 1044 | struct ocfs2_alloc_context *data_ac, | 1039 | u32 cpos, loff_t user_pos, int new, |
| 1045 | struct ocfs2_alloc_context *meta_ac, | 1040 | struct page *mmap_page) |
| 1046 | struct ocfs2_write_ctxt *wc) | ||
| 1047 | { | 1041 | { |
| 1048 | int ret, i, numpages = 1, new; | 1042 | int ret = 0, i; |
| 1049 | unsigned int copied = 0; | 1043 | unsigned long start, target_index, index; |
| 1050 | u32 tmp_pos; | ||
| 1051 | u64 v_blkno, p_blkno; | ||
| 1052 | struct address_space *mapping = file->f_mapping; | ||
| 1053 | struct inode *inode = mapping->host; | 1044 | struct inode *inode = mapping->host; |
| 1054 | unsigned long index, start; | ||
| 1055 | struct page **cpages; | ||
| 1056 | 1045 | ||
| 1057 | new = phys == 0 ? 1 : 0; | 1046 | target_index = user_pos >> PAGE_CACHE_SHIFT; |
| 1058 | 1047 | ||
| 1059 | /* | 1048 | /* |
| 1060 | * Figure out how many pages we'll be manipulating here. For | 1049 | * Figure out how many pages we'll be manipulating here. For |
| 1061 | * non allocating write, we just change the one | 1050 | * non allocating write, we just change the one |
| 1062 | * page. Otherwise, we'll need a whole clusters worth. | 1051 | * page. Otherwise, we'll need a whole clusters worth. |
| 1063 | */ | 1052 | */ |
| 1064 | if (new) | ||
| 1065 | numpages = ocfs2_pages_per_cluster(inode->i_sb); | ||
| 1066 | |||
| 1067 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
| 1068 | if (!cpages) { | ||
| 1069 | ret = -ENOMEM; | ||
| 1070 | mlog_errno(ret); | ||
| 1071 | return ret; | ||
| 1072 | } | ||
| 1073 | |||
| 1074 | /* | ||
| 1075 | * Fill our page array first. That way we've grabbed enough so | ||
| 1076 | * that we can zero and flush if we error after adding the | ||
| 1077 | * extent. | ||
| 1078 | */ | ||
| 1079 | if (new) { | 1053 | if (new) { |
| 1080 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | 1054 | wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); |
| 1081 | wc->w_cpos); | 1055 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); |
| 1082 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
| 1083 | } else { | 1056 | } else { |
| 1084 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | 1057 | wc->w_num_pages = 1; |
| 1085 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | 1058 | start = target_index; |
| 1086 | } | 1059 | } |
| 1087 | 1060 | ||
| 1088 | for(i = 0; i < numpages; i++) { | 1061 | for(i = 0; i < wc->w_num_pages; i++) { |
| 1089 | index = start + i; | 1062 | index = start + i; |
| 1090 | 1063 | ||
| 1091 | cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); | 1064 | if (index == target_index && mmap_page) { |
| 1092 | if (!cpages[i]) { | 1065 | /* |
| 1093 | ret = -ENOMEM; | 1066 | * ocfs2_pagemkwrite() is a little different |
| 1094 | mlog_errno(ret); | 1067 | * and wants us to directly use the page |
| 1095 | goto out; | 1068 | * passed in. |
| 1069 | */ | ||
| 1070 | lock_page(mmap_page); | ||
| 1071 | |||
| 1072 | if (mmap_page->mapping != mapping) { | ||
| 1073 | unlock_page(mmap_page); | ||
| 1074 | /* | ||
| 1075 | * Sanity check - the locking in | ||
| 1076 | * ocfs2_pagemkwrite() should ensure | ||
| 1077 | * that this code doesn't trigger. | ||
| 1078 | */ | ||
| 1079 | ret = -EINVAL; | ||
| 1080 | mlog_errno(ret); | ||
| 1081 | goto out; | ||
| 1082 | } | ||
| 1083 | |||
| 1084 | page_cache_get(mmap_page); | ||
| 1085 | wc->w_pages[i] = mmap_page; | ||
| 1086 | } else { | ||
| 1087 | wc->w_pages[i] = find_or_create_page(mapping, index, | ||
| 1088 | GFP_NOFS); | ||
| 1089 | if (!wc->w_pages[i]) { | ||
| 1090 | ret = -ENOMEM; | ||
| 1091 | mlog_errno(ret); | ||
| 1092 | goto out; | ||
| 1093 | } | ||
| 1096 | } | 1094 | } |
| 1095 | |||
| 1096 | if (index == target_index) | ||
| 1097 | wc->w_target_page = wc->w_pages[i]; | ||
| 1097 | } | 1098 | } |
| 1099 | out: | ||
| 1100 | return ret; | ||
| 1101 | } | ||
| 1102 | |||
| 1103 | /* | ||
| 1104 | * Prepare a single cluster for writing into the file. | ||
| 1105 | */ | ||
| 1106 | static int ocfs2_write_cluster(struct address_space *mapping, | ||
| 1107 | u32 phys, unsigned int unwritten, | ||
| 1108 | struct ocfs2_alloc_context *data_ac, | ||
| 1109 | struct ocfs2_alloc_context *meta_ac, | ||
| 1110 | struct ocfs2_write_ctxt *wc, u32 cpos, | ||
| 1111 | loff_t user_pos, unsigned user_len) | ||
| 1112 | { | ||
| 1113 | int ret, i, new, should_zero = 0; | ||
| 1114 | u64 v_blkno, p_blkno; | ||
| 1115 | struct inode *inode = mapping->host; | ||
| 1116 | |||
| 1117 | new = phys == 0 ? 1 : 0; | ||
| 1118 | if (new || unwritten) | ||
| 1119 | should_zero = 1; | ||
| 1098 | 1120 | ||
| 1099 | if (new) { | 1121 | if (new) { |
| 1122 | u32 tmp_pos; | ||
| 1123 | |||
| 1100 | /* | 1124 | /* |
| 1101 | * This is safe to call with the page locks - it won't take | 1125 | * This is safe to call with the page locks - it won't take |
| 1102 | * any additional semaphores or cluster locks. | 1126 | * any additional semaphores or cluster locks. |
| 1103 | */ | 1127 | */ |
| 1104 | tmp_pos = wc->w_cpos; | 1128 | tmp_pos = cpos; |
| 1105 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | 1129 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, |
| 1106 | &tmp_pos, 1, di_bh, handle, | 1130 | &tmp_pos, 1, 0, wc->w_di_bh, |
| 1107 | data_ac, meta_ac, NULL); | 1131 | wc->w_handle, data_ac, |
| 1132 | meta_ac, NULL); | ||
| 1108 | /* | 1133 | /* |
| 1109 | * This shouldn't happen because we must have already | 1134 | * This shouldn't happen because we must have already |
| 1110 | * calculated the correct meta data allocation required. The | 1135 | * calculated the correct meta data allocation required. The |
| @@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | |||
| 1121 | mlog_errno(ret); | 1146 | mlog_errno(ret); |
| 1122 | goto out; | 1147 | goto out; |
| 1123 | } | 1148 | } |
| 1149 | } else if (unwritten) { | ||
| 1150 | ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, | ||
| 1151 | wc->w_handle, cpos, 1, phys, | ||
| 1152 | meta_ac, &wc->w_dealloc); | ||
| 1153 | if (ret < 0) { | ||
| 1154 | mlog_errno(ret); | ||
| 1155 | goto out; | ||
| 1156 | } | ||
| 1124 | } | 1157 | } |
| 1125 | 1158 | ||
| 1159 | if (should_zero) | ||
| 1160 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | ||
| 1161 | else | ||
| 1162 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | ||
| 1163 | |||
| 1164 | /* | ||
| 1165 | * The only reason this should fail is due to an inability to | ||
| 1166 | * find the extent added. | ||
| 1167 | */ | ||
| 1126 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1168 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, |
| 1127 | NULL); | 1169 | NULL); |
| 1128 | if (ret < 0) { | 1170 | if (ret < 0) { |
| 1129 | 1171 | ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " | |
| 1130 | /* | 1172 | "at logical block %llu", |
| 1131 | * XXX: Should we go readonly here? | 1173 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
| 1132 | */ | 1174 | (unsigned long long)v_blkno); |
| 1133 | |||
| 1134 | mlog_errno(ret); | ||
| 1135 | goto out; | 1175 | goto out; |
| 1136 | } | 1176 | } |
| 1137 | 1177 | ||
| 1138 | BUG_ON(p_blkno == 0); | 1178 | BUG_ON(p_blkno == 0); |
| 1139 | 1179 | ||
| 1140 | for(i = 0; i < numpages; i++) { | 1180 | for(i = 0; i < wc->w_num_pages; i++) { |
| 1141 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | 1181 | int tmpret; |
| 1142 | wc, new); | 1182 | |
| 1143 | if (ret < 0) { | 1183 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, |
| 1144 | mlog_errno(ret); | 1184 | wc->w_pages[i], cpos, |
| 1145 | goto out; | 1185 | user_pos, user_len, |
| 1186 | should_zero); | ||
| 1187 | if (tmpret) { | ||
| 1188 | mlog_errno(tmpret); | ||
| 1189 | if (ret == 0) | ||
| 1190 | tmpret = ret; | ||
| 1146 | } | 1191 | } |
| 1147 | |||
| 1148 | copied += ret; | ||
| 1149 | } | 1192 | } |
| 1150 | 1193 | ||
| 1194 | /* | ||
| 1195 | * We only have cleanup to do in case of allocating write. | ||
| 1196 | */ | ||
| 1197 | if (ret && new) | ||
| 1198 | ocfs2_write_failure(inode, wc, user_pos, user_len); | ||
| 1199 | |||
| 1151 | out: | 1200 | out: |
| 1152 | for(i = 0; i < numpages; i++) { | 1201 | |
| 1153 | unlock_page(cpages[i]); | 1202 | return ret; |
| 1154 | mark_page_accessed(cpages[i]); | 1203 | } |
| 1155 | page_cache_release(cpages[i]); | 1204 | |
| 1205 | static int ocfs2_write_cluster_by_desc(struct address_space *mapping, | ||
| 1206 | struct ocfs2_alloc_context *data_ac, | ||
| 1207 | struct ocfs2_alloc_context *meta_ac, | ||
| 1208 | struct ocfs2_write_ctxt *wc, | ||
| 1209 | loff_t pos, unsigned len) | ||
| 1210 | { | ||
| 1211 | int ret, i; | ||
| 1212 | struct ocfs2_write_cluster_desc *desc; | ||
| 1213 | |||
| 1214 | for (i = 0; i < wc->w_clen; i++) { | ||
| 1215 | desc = &wc->w_desc[i]; | ||
| 1216 | |||
| 1217 | ret = ocfs2_write_cluster(mapping, desc->c_phys, | ||
| 1218 | desc->c_unwritten, data_ac, meta_ac, | ||
| 1219 | wc, desc->c_cpos, pos, len); | ||
| 1220 | if (ret) { | ||
| 1221 | mlog_errno(ret); | ||
| 1222 | goto out; | ||
| 1223 | } | ||
| 1156 | } | 1224 | } |
| 1157 | kfree(cpages); | ||
| 1158 | 1225 | ||
| 1159 | return copied ? copied : ret; | 1226 | ret = 0; |
| 1227 | out: | ||
| 1228 | return ret; | ||
| 1160 | } | 1229 | } |
| 1161 | 1230 | ||
| 1162 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | 1231 | /* |
| 1163 | struct ocfs2_super *osb, loff_t pos, | 1232 | * ocfs2_write_end() wants to know which parts of the target page it |
| 1164 | size_t count, ocfs2_page_writer *cb, | 1233 | * should complete the write on. It's easiest to compute them ahead of |
| 1165 | void *cb_priv) | 1234 | * time when a more complete view of the write is available. |
| 1235 | */ | ||
| 1236 | static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | ||
| 1237 | struct ocfs2_write_ctxt *wc, | ||
| 1238 | loff_t pos, unsigned len, int alloc) | ||
| 1166 | { | 1239 | { |
| 1167 | wc->w_count = count; | 1240 | struct ocfs2_write_cluster_desc *desc; |
| 1168 | wc->w_pos = pos; | ||
| 1169 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
| 1170 | wc->w_finished_copy = 0; | ||
| 1171 | 1241 | ||
| 1172 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | 1242 | wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); |
| 1173 | wc->w_large_pages = 1; | 1243 | wc->w_target_to = wc->w_target_from + len; |
| 1174 | else | ||
| 1175 | wc->w_large_pages = 0; | ||
| 1176 | 1244 | ||
| 1177 | wc->w_write_data_page = cb; | 1245 | if (alloc == 0) |
| 1178 | wc->w_private = cb_priv; | 1246 | return; |
| 1247 | |||
| 1248 | /* | ||
| 1249 | * Allocating write - we may have different boundaries based | ||
| 1250 | * on page size and cluster size. | ||
| 1251 | * | ||
| 1252 | * NOTE: We can no longer compute one value from the other as | ||
| 1253 | * the actual write length and user provided length may be | ||
| 1254 | * different. | ||
| 1255 | */ | ||
| 1256 | |||
| 1257 | if (wc->w_large_pages) { | ||
| 1258 | /* | ||
| 1259 | * We only care about the 1st and last cluster within | ||
| 1260 | * our range and whether they should be zero'd or not. Either | ||
| 1261 | * value may be extended out to the start/end of a | ||
| 1262 | * newly allocated cluster. | ||
| 1263 | */ | ||
| 1264 | desc = &wc->w_desc[0]; | ||
| 1265 | if (ocfs2_should_zero_cluster(desc)) | ||
| 1266 | ocfs2_figure_cluster_boundaries(osb, | ||
| 1267 | desc->c_cpos, | ||
| 1268 | &wc->w_target_from, | ||
| 1269 | NULL); | ||
| 1270 | |||
| 1271 | desc = &wc->w_desc[wc->w_clen - 1]; | ||
| 1272 | if (ocfs2_should_zero_cluster(desc)) | ||
| 1273 | ocfs2_figure_cluster_boundaries(osb, | ||
| 1274 | desc->c_cpos, | ||
| 1275 | NULL, | ||
| 1276 | &wc->w_target_to); | ||
| 1277 | } else { | ||
| 1278 | wc->w_target_from = 0; | ||
| 1279 | wc->w_target_to = PAGE_CACHE_SIZE; | ||
| 1280 | } | ||
| 1179 | } | 1281 | } |
| 1180 | 1282 | ||
| 1181 | /* | 1283 | /* |
| 1182 | * Write a cluster to an inode. The cluster may not be allocated yet, | 1284 | * Populate each single-cluster write descriptor in the write context |
| 1183 | * in which case it will be. This only exists for buffered writes - | 1285 | * with information about the i/o to be done. |
| 1184 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
| 1185 | * | ||
| 1186 | * The caller is responsible for incrementing pos, written counts, etc | ||
| 1187 | * | 1286 | * |
| 1188 | * For file systems that don't support sparse files, pre-allocation | 1287 | * Returns the number of clusters that will have to be allocated, as |
| 1189 | * and page zeroing up until cpos should be done prior to this | 1288 | * well as a worst case estimate of the number of extent records that |
| 1190 | * function call. | 1289 | * would have to be created during a write to an unwritten region. |
| 1191 | * | ||
| 1192 | * Callers should be holding i_sem, and the rw cluster lock. | ||
| 1193 | * | ||
| 1194 | * Returns the number of user bytes written, or less than zero for | ||
| 1195 | * error. | ||
| 1196 | */ | 1290 | */ |
| 1197 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | 1291 | static int ocfs2_populate_write_desc(struct inode *inode, |
| 1198 | size_t count, ocfs2_page_writer *actor, | 1292 | struct ocfs2_write_ctxt *wc, |
| 1199 | void *priv) | 1293 | unsigned int *clusters_to_alloc, |
| 1294 | unsigned int *extents_to_split) | ||
| 1295 | { | ||
| 1296 | int ret; | ||
| 1297 | struct ocfs2_write_cluster_desc *desc; | ||
| 1298 | unsigned int num_clusters = 0; | ||
| 1299 | unsigned int ext_flags = 0; | ||
| 1300 | u32 phys = 0; | ||
| 1301 | int i; | ||
| 1302 | |||
| 1303 | *clusters_to_alloc = 0; | ||
| 1304 | *extents_to_split = 0; | ||
| 1305 | |||
| 1306 | for (i = 0; i < wc->w_clen; i++) { | ||
| 1307 | desc = &wc->w_desc[i]; | ||
| 1308 | desc->c_cpos = wc->w_cpos + i; | ||
| 1309 | |||
| 1310 | if (num_clusters == 0) { | ||
| 1311 | /* | ||
| 1312 | * Need to look up the next extent record. | ||
| 1313 | */ | ||
| 1314 | ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, | ||
| 1315 | &num_clusters, &ext_flags); | ||
| 1316 | if (ret) { | ||
| 1317 | mlog_errno(ret); | ||
| 1318 | goto out; | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | /* | ||
| 1322 | * Assume worst case - that we're writing in | ||
| 1323 | * the middle of the extent. | ||
| 1324 | * | ||
| 1325 | * We can assume that the write proceeds from | ||
| 1326 | * left to right, in which case the extent | ||
| 1327 | * insert code is smart enough to coalesce the | ||
| 1328 | * next splits into the previous records created. | ||
| 1329 | */ | ||
| 1330 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | ||
| 1331 | *extents_to_split = *extents_to_split + 2; | ||
| 1332 | } else if (phys) { | ||
| 1333 | /* | ||
| 1334 | * Only increment phys if it doesn't describe | ||
| 1335 | * a hole. | ||
| 1336 | */ | ||
| 1337 | phys++; | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | desc->c_phys = phys; | ||
| 1341 | if (phys == 0) { | ||
| 1342 | desc->c_new = 1; | ||
| 1343 | *clusters_to_alloc = *clusters_to_alloc + 1; | ||
| 1344 | } | ||
| 1345 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | ||
| 1346 | desc->c_unwritten = 1; | ||
| 1347 | |||
| 1348 | num_clusters--; | ||
| 1349 | } | ||
| 1350 | |||
| 1351 | ret = 0; | ||
| 1352 | out: | ||
| 1353 | return ret; | ||
| 1354 | } | ||
| 1355 | |||
| 1356 | int ocfs2_write_begin_nolock(struct address_space *mapping, | ||
| 1357 | loff_t pos, unsigned len, unsigned flags, | ||
| 1358 | struct page **pagep, void **fsdata, | ||
| 1359 | struct buffer_head *di_bh, struct page *mmap_page) | ||
| 1200 | { | 1360 | { |
| 1201 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | 1361 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; |
| 1202 | ssize_t written = 0; | 1362 | unsigned int clusters_to_alloc, extents_to_split; |
| 1203 | u32 phys; | 1363 | struct ocfs2_write_ctxt *wc; |
| 1204 | struct inode *inode = file->f_mapping->host; | 1364 | struct inode *inode = mapping->host; |
| 1205 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1365 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 1206 | struct buffer_head *di_bh = NULL; | ||
| 1207 | struct ocfs2_dinode *di; | 1366 | struct ocfs2_dinode *di; |
| 1208 | struct ocfs2_alloc_context *data_ac = NULL; | 1367 | struct ocfs2_alloc_context *data_ac = NULL; |
| 1209 | struct ocfs2_alloc_context *meta_ac = NULL; | 1368 | struct ocfs2_alloc_context *meta_ac = NULL; |
| 1210 | handle_t *handle; | 1369 | handle_t *handle; |
| 1211 | struct ocfs2_write_ctxt wc; | ||
| 1212 | |||
| 1213 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
| 1214 | 1370 | ||
| 1215 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | 1371 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); |
| 1216 | if (ret) { | 1372 | if (ret) { |
| 1217 | mlog_errno(ret); | 1373 | mlog_errno(ret); |
| 1218 | goto out; | 1374 | return ret; |
| 1219 | } | 1375 | } |
| 1220 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
| 1221 | |||
| 1222 | /* | ||
| 1223 | * Take alloc sem here to prevent concurrent lookups. That way | ||
| 1224 | * the mapping, zeroing and tree manipulation within | ||
| 1225 | * ocfs2_write() will be safe against ->readpage(). This | ||
| 1226 | * should also serve to lock out allocation from a shared | ||
| 1227 | * writeable region. | ||
| 1228 | */ | ||
| 1229 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
| 1230 | 1376 | ||
| 1231 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | 1377 | ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, |
| 1378 | &extents_to_split); | ||
| 1232 | if (ret) { | 1379 | if (ret) { |
| 1233 | mlog_errno(ret); | 1380 | mlog_errno(ret); |
| 1234 | goto out_meta; | 1381 | goto out; |
| 1235 | } | 1382 | } |
| 1236 | 1383 | ||
| 1237 | /* phys == 0 means that allocation is required. */ | 1384 | di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; |
| 1238 | if (phys == 0) { | 1385 | |
| 1239 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | 1386 | /* |
| 1387 | * We set w_target_from, w_target_to here so that | ||
| 1388 | * ocfs2_write_end() knows which range in the target page to | ||
| 1389 | * write out. An allocation requires that we write the entire | ||
| 1390 | * cluster range. | ||
| 1391 | */ | ||
| 1392 | if (clusters_to_alloc || extents_to_split) { | ||
| 1393 | /* | ||
| 1394 | * XXX: We are stretching the limits of | ||
| 1395 | * ocfs2_lock_allocators(). It greatly over-estimates | ||
| 1396 | * the work to be done. | ||
| 1397 | */ | ||
| 1398 | ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, | ||
| 1399 | extents_to_split, &data_ac, &meta_ac); | ||
| 1240 | if (ret) { | 1400 | if (ret) { |
| 1241 | mlog_errno(ret); | 1401 | mlog_errno(ret); |
| 1242 | goto out_meta; | 1402 | goto out; |
| 1243 | } | 1403 | } |
| 1244 | 1404 | ||
| 1245 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | 1405 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, |
| 1246 | } | 1406 | clusters_to_alloc); |
| 1247 | 1407 | ||
| 1248 | ret = ocfs2_data_lock(inode, 1); | ||
| 1249 | if (ret) { | ||
| 1250 | mlog_errno(ret); | ||
| 1251 | goto out_meta; | ||
| 1252 | } | 1408 | } |
| 1253 | 1409 | ||
| 1410 | ocfs2_set_target_boundaries(osb, wc, pos, len, | ||
| 1411 | clusters_to_alloc + extents_to_split); | ||
| 1412 | |||
| 1254 | handle = ocfs2_start_trans(osb, credits); | 1413 | handle = ocfs2_start_trans(osb, credits); |
| 1255 | if (IS_ERR(handle)) { | 1414 | if (IS_ERR(handle)) { |
| 1256 | ret = PTR_ERR(handle); | 1415 | ret = PTR_ERR(handle); |
| 1257 | mlog_errno(ret); | 1416 | mlog_errno(ret); |
| 1258 | goto out_data; | 1417 | goto out; |
| 1259 | } | 1418 | } |
| 1260 | 1419 | ||
| 1261 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | 1420 | wc->w_handle = handle; |
| 1262 | meta_ac, &wc); | 1421 | |
| 1263 | if (written < 0) { | 1422 | /* |
| 1264 | ret = written; | 1423 | * We don't want this to fail in ocfs2_write_end(), so do it |
| 1424 | * here. | ||
| 1425 | */ | ||
| 1426 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | ||
| 1427 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 1428 | if (ret) { | ||
| 1265 | mlog_errno(ret); | 1429 | mlog_errno(ret); |
| 1266 | goto out_commit; | 1430 | goto out_commit; |
| 1267 | } | 1431 | } |
| 1268 | 1432 | ||
| 1269 | ret = ocfs2_journal_access(handle, inode, di_bh, | 1433 | /* |
| 1270 | OCFS2_JOURNAL_ACCESS_WRITE); | 1434 | * Fill our page array first. That way we've grabbed enough so |
| 1435 | * that we can zero and flush if we error after adding the | ||
| 1436 | * extent. | ||
| 1437 | */ | ||
| 1438 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | ||
| 1439 | clusters_to_alloc + extents_to_split, | ||
| 1440 | mmap_page); | ||
| 1271 | if (ret) { | 1441 | if (ret) { |
| 1272 | mlog_errno(ret); | 1442 | mlog_errno(ret); |
| 1273 | goto out_commit; | 1443 | goto out_commit; |
| 1274 | } | 1444 | } |
| 1275 | 1445 | ||
| 1276 | pos += written; | 1446 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, |
| 1447 | len); | ||
| 1448 | if (ret) { | ||
| 1449 | mlog_errno(ret); | ||
| 1450 | goto out_commit; | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | if (data_ac) | ||
| 1454 | ocfs2_free_alloc_context(data_ac); | ||
| 1455 | if (meta_ac) | ||
| 1456 | ocfs2_free_alloc_context(meta_ac); | ||
| 1457 | |||
| 1458 | *pagep = wc->w_target_page; | ||
| 1459 | *fsdata = wc; | ||
| 1460 | return 0; | ||
| 1461 | out_commit: | ||
| 1462 | ocfs2_commit_trans(osb, handle); | ||
| 1463 | |||
| 1464 | out: | ||
| 1465 | ocfs2_free_write_ctxt(wc); | ||
| 1466 | |||
| 1467 | if (data_ac) | ||
| 1468 | ocfs2_free_alloc_context(data_ac); | ||
| 1469 | if (meta_ac) | ||
| 1470 | ocfs2_free_alloc_context(meta_ac); | ||
| 1471 | return ret; | ||
| 1472 | } | ||
| 1473 | |||
| 1474 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, | ||
| 1475 | loff_t pos, unsigned len, unsigned flags, | ||
| 1476 | struct page **pagep, void **fsdata) | ||
| 1477 | { | ||
| 1478 | int ret; | ||
| 1479 | struct buffer_head *di_bh = NULL; | ||
| 1480 | struct inode *inode = mapping->host; | ||
| 1481 | |||
| 1482 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
| 1483 | if (ret) { | ||
| 1484 | mlog_errno(ret); | ||
| 1485 | return ret; | ||
| 1486 | } | ||
| 1487 | |||
| 1488 | /* | ||
| 1489 | * Take alloc sem here to prevent concurrent lookups. That way | ||
| 1490 | * the mapping, zeroing and tree manipulation within | ||
| 1491 | * ocfs2_write() will be safe against ->readpage(). This | ||
| 1492 | * should also serve to lock out allocation from a shared | ||
| 1493 | * writeable region. | ||
| 1494 | */ | ||
| 1495 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
| 1496 | |||
| 1497 | ret = ocfs2_data_lock(inode, 1); | ||
| 1498 | if (ret) { | ||
| 1499 | mlog_errno(ret); | ||
| 1500 | goto out_fail; | ||
| 1501 | } | ||
| 1502 | |||
| 1503 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, | ||
| 1504 | fsdata, di_bh, NULL); | ||
| 1505 | if (ret) { | ||
| 1506 | mlog_errno(ret); | ||
| 1507 | goto out_fail_data; | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | brelse(di_bh); | ||
| 1511 | |||
| 1512 | return 0; | ||
| 1513 | |||
| 1514 | out_fail_data: | ||
| 1515 | ocfs2_data_unlock(inode, 1); | ||
| 1516 | out_fail: | ||
| 1517 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
| 1518 | |||
| 1519 | brelse(di_bh); | ||
| 1520 | ocfs2_meta_unlock(inode, 1); | ||
| 1521 | |||
| 1522 | return ret; | ||
| 1523 | } | ||
| 1524 | |||
| 1525 | int ocfs2_write_end_nolock(struct address_space *mapping, | ||
| 1526 | loff_t pos, unsigned len, unsigned copied, | ||
| 1527 | struct page *page, void *fsdata) | ||
| 1528 | { | ||
| 1529 | int i; | ||
| 1530 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1531 | struct inode *inode = mapping->host; | ||
| 1532 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
| 1533 | struct ocfs2_write_ctxt *wc = fsdata; | ||
| 1534 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | ||
| 1535 | handle_t *handle = wc->w_handle; | ||
| 1536 | struct page *tmppage; | ||
| 1537 | |||
| 1538 | if (unlikely(copied < len)) { | ||
| 1539 | if (!PageUptodate(wc->w_target_page)) | ||
| 1540 | copied = 0; | ||
| 1541 | |||
| 1542 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | ||
| 1543 | start+len); | ||
| 1544 | } | ||
| 1545 | flush_dcache_page(wc->w_target_page); | ||
| 1546 | |||
| 1547 | for(i = 0; i < wc->w_num_pages; i++) { | ||
| 1548 | tmppage = wc->w_pages[i]; | ||
| 1549 | |||
| 1550 | if (tmppage == wc->w_target_page) { | ||
| 1551 | from = wc->w_target_from; | ||
| 1552 | to = wc->w_target_to; | ||
| 1553 | |||
| 1554 | BUG_ON(from > PAGE_CACHE_SIZE || | ||
| 1555 | to > PAGE_CACHE_SIZE || | ||
| 1556 | to < from); | ||
| 1557 | } else { | ||
| 1558 | /* | ||
| 1559 | * Pages adjacent to the target (if any) imply | ||
| 1560 | * a hole-filling write in which case we want | ||
| 1561 | * to flush their entire range. | ||
| 1562 | */ | ||
| 1563 | from = 0; | ||
| 1564 | to = PAGE_CACHE_SIZE; | ||
| 1565 | } | ||
| 1566 | |||
| 1567 | if (ocfs2_should_order_data(inode)) | ||
| 1568 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | ||
| 1569 | from, to, NULL, | ||
| 1570 | ocfs2_journal_dirty_data); | ||
| 1571 | |||
| 1572 | block_commit_write(tmppage, from, to); | ||
| 1573 | } | ||
| 1574 | |||
| 1575 | pos += copied; | ||
| 1277 | if (pos > inode->i_size) { | 1576 | if (pos > inode->i_size) { |
| 1278 | i_size_write(inode, pos); | 1577 | i_size_write(inode, pos); |
| 1279 | mark_inode_dirty(inode); | 1578 | mark_inode_dirty(inode); |
| @@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | |||
| 1283 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1582 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 1284 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | 1583 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); |
| 1285 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 1584 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
| 1585 | ocfs2_journal_dirty(handle, wc->w_di_bh); | ||
| 1286 | 1586 | ||
| 1287 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
| 1288 | if (ret) | ||
| 1289 | mlog_errno(ret); | ||
| 1290 | |||
| 1291 | out_commit: | ||
| 1292 | ocfs2_commit_trans(osb, handle); | 1587 | ocfs2_commit_trans(osb, handle); |
| 1293 | 1588 | ||
| 1294 | out_data: | 1589 | ocfs2_run_deallocs(osb, &wc->w_dealloc); |
| 1295 | ocfs2_data_unlock(inode, 1); | 1590 | |
| 1591 | ocfs2_free_write_ctxt(wc); | ||
| 1592 | |||
| 1593 | return copied; | ||
| 1594 | } | ||
| 1595 | |||
| 1596 | int ocfs2_write_end(struct file *file, struct address_space *mapping, | ||
| 1597 | loff_t pos, unsigned len, unsigned copied, | ||
| 1598 | struct page *page, void *fsdata) | ||
| 1599 | { | ||
| 1600 | int ret; | ||
| 1601 | struct inode *inode = mapping->host; | ||
| 1296 | 1602 | ||
| 1297 | out_meta: | 1603 | ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); |
| 1604 | |||
| 1605 | ocfs2_data_unlock(inode, 1); | ||
| 1298 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 1606 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
| 1299 | ocfs2_meta_unlock(inode, 1); | 1607 | ocfs2_meta_unlock(inode, 1); |
| 1300 | 1608 | ||
| 1301 | out: | 1609 | return ret; |
| 1302 | brelse(di_bh); | ||
| 1303 | if (data_ac) | ||
| 1304 | ocfs2_free_alloc_context(data_ac); | ||
| 1305 | if (meta_ac) | ||
| 1306 | ocfs2_free_alloc_context(meta_ac); | ||
| 1307 | |||
| 1308 | return written ? written : ret; | ||
| 1309 | } | 1610 | } |
| 1310 | 1611 | ||
| 1311 | const struct address_space_operations ocfs2_aops = { | 1612 | const struct address_space_operations ocfs2_aops = { |
