diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-02-09 23:24:12 -0500 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-04-26 18:02:08 -0400 |
commit | 9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 (patch) | |
tree | 3cac0c18d0cacc316e0e8a60f483282d6f991779 /fs/ocfs2/aops.c | |
parent | 89488984ac23b0580f959b9ee549f2fcb1c2f194 (diff) |
ocfs2: teach ocfs2_file_aio_write() about sparse files
Unfortunately, ocfs2 can no longer make use of generic_file_aio_write_nolock()
because allocating writes will require zeroing of pages adjacent to the I/O
for cluster sizes greater than page size.
Implement a custom file write here, which can order page locks for zeroing.
This also has the advantage that cluster locks can easily be ordered outside
of the page locks.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/aops.c')
-rw-r--r-- | fs/ocfs2/aops.c | 679 |
1 files changed, 663 insertions, 16 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f3b0cc5cba1a..5ffb3702b5e9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
27 | #include <linux/swap.h> | ||
27 | 28 | ||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | 29 | #define MLOG_MASK_PREFIX ML_FILE_IO |
29 | #include <cluster/masklog.h> | 30 | #include <cluster/masklog.h> |
@@ -37,6 +38,7 @@ | |||
37 | #include "file.h" | 38 | #include "file.h" |
38 | #include "inode.h" | 39 | #include "inode.h" |
39 | #include "journal.h" | 40 | #include "journal.h" |
41 | #include "suballoc.h" | ||
40 | #include "super.h" | 42 | #include "super.h" |
41 | #include "symlink.h" | 43 | #include "symlink.h" |
42 | 44 | ||
@@ -645,23 +647,27 @@ static ssize_t ocfs2_direct_IO(int rw, | |||
645 | 647 | ||
646 | mlog_entry_void(); | 648 | mlog_entry_void(); |
647 | 649 | ||
648 | /* | 650 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
649 | * We get PR data locks even for O_DIRECT. This allows | 651 | /* |
650 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with | 652 | * We get PR data locks even for O_DIRECT. This |
651 | * extending and buffered zeroing writes race. If they did | 653 | * allows concurrent O_DIRECT I/O but doesn't let |
652 | * race then the buffered zeroing could be written back after | 654 | * O_DIRECT with extending and buffered zeroing writes |
653 | * the O_DIRECT I/O. It's one thing to tell people not to mix | 655 | * race. If they did race then the buffered zeroing |
654 | * buffered and O_DIRECT writes, but expecting them to | 656 | * could be written back after the O_DIRECT I/O. It's |
655 | * understand that file extension is also an implicit buffered | 657 | * one thing to tell people not to mix buffered and |
656 | * write is too much. By getting the PR we force writeback of | 658 | * O_DIRECT writes, but expecting them to understand |
657 | * the buffered zeroing before proceeding. | 659 | * that file extension is also an implicit buffered |
658 | */ | 660 | * write is too much. By getting the PR we force |
659 | ret = ocfs2_data_lock(inode, 0); | 661 | * writeback of the buffered zeroing before |
660 | if (ret < 0) { | 662 | * proceeding. |
661 | mlog_errno(ret); | 663 | */ |
662 | goto out; | 664 | ret = ocfs2_data_lock(inode, 0); |
665 | if (ret < 0) { | ||
666 | mlog_errno(ret); | ||
667 | goto out; | ||
668 | } | ||
669 | ocfs2_data_unlock(inode, 0); | ||
663 | } | 670 | } |
664 | ocfs2_data_unlock(inode, 0); | ||
665 | 671 | ||
666 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 672 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
667 | inode->i_sb->s_bdev, iov, offset, | 673 | inode->i_sb->s_bdev, iov, offset, |
@@ -673,6 +679,647 @@ out: | |||
673 | return ret; | 679 | return ret; |
674 | } | 680 | } |
675 | 681 | ||
682 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, | ||
683 | u32 cpos, | ||
684 | unsigned int *start, | ||
685 | unsigned int *end) | ||
686 | { | ||
687 | unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; | ||
688 | |||
689 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { | ||
690 | unsigned int cpp; | ||
691 | |||
692 | cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); | ||
693 | |||
694 | cluster_start = cpos % cpp; | ||
695 | cluster_start = cluster_start << osb->s_clustersize_bits; | ||
696 | |||
697 | cluster_end = cluster_start + osb->s_clustersize; | ||
698 | } | ||
699 | |||
700 | BUG_ON(cluster_start > PAGE_SIZE); | ||
701 | BUG_ON(cluster_end > PAGE_SIZE); | ||
702 | |||
703 | if (start) | ||
704 | *start = cluster_start; | ||
705 | if (end) | ||
706 | *end = cluster_end; | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * 'from' and 'to' are the region in the page to avoid zeroing. | ||
711 | * | ||
712 | * If pagesize > clustersize, this function will avoid zeroing outside | ||
713 | * of the cluster boundary. | ||
714 | * | ||
715 | * from == to == 0 is code for "zero the entire cluster region" | ||
716 | */ | ||
717 | static void ocfs2_clear_page_regions(struct page *page, | ||
718 | struct ocfs2_super *osb, u32 cpos, | ||
719 | unsigned from, unsigned to) | ||
720 | { | ||
721 | void *kaddr; | ||
722 | unsigned int cluster_start, cluster_end; | ||
723 | |||
724 | ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); | ||
725 | |||
726 | kaddr = kmap_atomic(page, KM_USER0); | ||
727 | |||
728 | if (from || to) { | ||
729 | if (from > cluster_start) | ||
730 | memset(kaddr + cluster_start, 0, from - cluster_start); | ||
731 | if (to < cluster_end) | ||
732 | memset(kaddr + to, 0, cluster_end - to); | ||
733 | } else { | ||
734 | memset(kaddr + cluster_start, 0, cluster_end - cluster_start); | ||
735 | } | ||
736 | |||
737 | kunmap_atomic(kaddr, KM_USER0); | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Some of this taken from block_prepare_write(). We already have our | ||
742 | * mapping by now though, and the entire write will be allocating or | ||
743 | * it won't, so not much need to use BH_New. | ||
744 | * | ||
745 | * This will also skip zeroing, which is handled externally. | ||
746 | */ | ||
747 | static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | ||
748 | struct inode *inode, unsigned int from, | ||
749 | unsigned int to, int new) | ||
750 | { | ||
751 | int ret = 0; | ||
752 | struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; | ||
753 | unsigned int block_end, block_start; | ||
754 | unsigned int bsize = 1 << inode->i_blkbits; | ||
755 | |||
756 | if (!page_has_buffers(page)) | ||
757 | create_empty_buffers(page, bsize, 0); | ||
758 | |||
759 | head = page_buffers(page); | ||
760 | for (bh = head, block_start = 0; bh != head || !block_start; | ||
761 | bh = bh->b_this_page, block_start += bsize) { | ||
762 | block_end = block_start + bsize; | ||
763 | |||
764 | /* | ||
765 | * Ignore blocks outside of our i/o range - | ||
766 | * they may belong to unallocated clusters. | ||
767 | */ | ||
768 | if (block_start >= to || | ||
769 | (block_start + bsize) <= from) { | ||
770 | if (PageUptodate(page)) | ||
771 | set_buffer_uptodate(bh); | ||
772 | continue; | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * For an allocating write with cluster size >= page | ||
777 | * size, we always write the entire page. | ||
778 | */ | ||
779 | |||
780 | if (buffer_new(bh)) | ||
781 | clear_buffer_new(bh); | ||
782 | |||
783 | if (!buffer_mapped(bh)) { | ||
784 | map_bh(bh, inode->i_sb, *p_blkno); | ||
785 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
786 | } | ||
787 | |||
788 | if (PageUptodate(page)) { | ||
789 | if (!buffer_uptodate(bh)) | ||
790 | set_buffer_uptodate(bh); | ||
791 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | ||
792 | (block_start < from || block_end > to)) { | ||
793 | ll_rw_block(READ, 1, &bh); | ||
794 | *wait_bh++=bh; | ||
795 | } | ||
796 | |||
797 | *p_blkno = *p_blkno + 1; | ||
798 | } | ||
799 | |||
800 | /* | ||
801 | * If we issued read requests - let them complete. | ||
802 | */ | ||
803 | while(wait_bh > wait) { | ||
804 | wait_on_buffer(*--wait_bh); | ||
805 | if (!buffer_uptodate(*wait_bh)) | ||
806 | ret = -EIO; | ||
807 | } | ||
808 | |||
809 | if (ret == 0 || !new) | ||
810 | return ret; | ||
811 | |||
812 | /* | ||
813 | * If we get -EIO above, zero out any newly allocated blocks | ||
814 | * to avoid exposing stale data. | ||
815 | */ | ||
816 | bh = head; | ||
817 | block_start = 0; | ||
818 | do { | ||
819 | void *kaddr; | ||
820 | |||
821 | block_end = block_start + bsize; | ||
822 | if (block_end <= from) | ||
823 | goto next_bh; | ||
824 | if (block_start >= to) | ||
825 | break; | ||
826 | |||
827 | kaddr = kmap_atomic(page, KM_USER0); | ||
828 | memset(kaddr+block_start, 0, bh->b_size); | ||
829 | flush_dcache_page(page); | ||
830 | kunmap_atomic(kaddr, KM_USER0); | ||
831 | set_buffer_uptodate(bh); | ||
832 | mark_buffer_dirty(bh); | ||
833 | |||
834 | next_bh: | ||
835 | block_start = block_end; | ||
836 | bh = bh->b_this_page; | ||
837 | } while (bh != head); | ||
838 | |||
839 | return ret; | ||
840 | } | ||
841 | |||
842 | /* | ||
843 | * This will copy user data from the iovec in the buffered write | ||
844 | * context. | ||
845 | */ | ||
846 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
847 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
848 | unsigned int *ret_from, unsigned int *ret_to) | ||
849 | { | ||
850 | int ret; | ||
851 | unsigned int to, from, cluster_start, cluster_end; | ||
852 | unsigned long bytes, src_from; | ||
853 | char *dst; | ||
854 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
855 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
856 | char __user *buf; | ||
857 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
858 | |||
859 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
860 | &cluster_end); | ||
861 | |||
862 | buf = cur_iov->iov_base + bp->b_cur_off; | ||
863 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | ||
864 | |||
865 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | ||
866 | |||
867 | /* | ||
868 | * This is a lot of comparisons, but it reads quite | ||
869 | * easily, which is important here. | ||
870 | */ | ||
871 | /* Stay within the src page */ | ||
872 | bytes = PAGE_SIZE - src_from; | ||
873 | /* Stay within the vector */ | ||
874 | bytes = min(bytes, | ||
875 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
876 | /* Stay within count */ | ||
877 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
878 | /* | ||
879 | * For clustersize > page size, just stay within | ||
880 | * target page, otherwise we have to calculate pos | ||
881 | * within the cluster and obey the rightmost | ||
882 | * boundary. | ||
883 | */ | ||
884 | if (wc->w_large_pages) { | ||
885 | /* | ||
886 | * For cluster size < page size, we have to | ||
887 | * calculate pos within the cluster and obey | ||
888 | * the rightmost boundary. | ||
889 | */ | ||
890 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
891 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
892 | } else { | ||
893 | /* | ||
894 | * cluster size > page size is the most common | ||
895 | * case - we just stay within the target page | ||
896 | * boundary. | ||
897 | */ | ||
898 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
899 | } | ||
900 | |||
901 | to = from + bytes; | ||
902 | |||
903 | if (wc->w_this_page_new) | ||
904 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
905 | cluster_start, cluster_end, 1); | ||
906 | else | ||
907 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
908 | from, to, 0); | ||
909 | if (ret) { | ||
910 | mlog_errno(ret); | ||
911 | goto out; | ||
912 | } | ||
913 | |||
914 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
915 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
916 | BUG_ON(from > osb->s_clustersize); | ||
917 | BUG_ON(to > osb->s_clustersize); | ||
918 | |||
919 | dst = kmap(wc->w_this_page); | ||
920 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | ||
921 | kunmap(wc->w_this_page); | ||
922 | |||
923 | /* | ||
924 | * XXX: This is slow, but simple. The caller of | ||
925 | * ocfs2_buffered_write_cluster() is responsible for | ||
926 | * passing through the iovecs, so it's difficult to | ||
927 | * predict what our next step is in here after our | ||
928 | * initial write. A future version should be pushing | ||
929 | * that iovec manipulation further down. | ||
930 | * | ||
931 | * By setting this, we indicate that a copy from user | ||
932 | * data was done, and subsequent calls for this | ||
933 | * cluster will skip copying more data. | ||
934 | */ | ||
935 | wc->w_finished_copy = 1; | ||
936 | |||
937 | *ret_from = from; | ||
938 | *ret_to = to; | ||
939 | out: | ||
940 | |||
941 | return bytes ? (unsigned int)bytes : ret; | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * Map, fill and write a page to disk. | ||
946 | * | ||
947 | * The work of copying data is done via callback. Newly allocated | ||
948 | * pages which don't take user data will be zero'd (set 'new' to | ||
949 | * indicate an allocating write) | ||
950 | * | ||
951 | * Returns a negative error code or the number of bytes copied into | ||
952 | * the page. | ||
953 | */ | ||
954 | int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
955 | u64 *p_blkno, struct page *page, | ||
956 | struct ocfs2_write_ctxt *wc, int new) | ||
957 | { | ||
958 | int ret, copied = 0; | ||
959 | unsigned int from = 0, to = 0; | ||
960 | unsigned int cluster_start, cluster_end; | ||
961 | unsigned int zero_from = 0, zero_to = 0; | ||
962 | |||
963 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | ||
964 | &cluster_start, &cluster_end); | ||
965 | |||
966 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | ||
967 | && !wc->w_finished_copy) { | ||
968 | |||
969 | wc->w_this_page = page; | ||
970 | wc->w_this_page_new = new; | ||
971 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | ||
972 | if (ret < 0) { | ||
973 | mlog_errno(ret); | ||
974 | goto out; | ||
975 | } | ||
976 | |||
977 | copied = ret; | ||
978 | |||
979 | zero_from = from; | ||
980 | zero_to = to; | ||
981 | if (new) { | ||
982 | from = cluster_start; | ||
983 | to = cluster_end; | ||
984 | } | ||
985 | } else { | ||
986 | /* | ||
987 | * If we haven't allocated the new page yet, we | ||
988 | * shouldn't be writing it out without copying user | ||
989 | * data. This is likely a math error from the caller. | ||
990 | */ | ||
991 | BUG_ON(!new); | ||
992 | |||
993 | from = cluster_start; | ||
994 | to = cluster_end; | ||
995 | |||
996 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
997 | cluster_start, cluster_end, 1); | ||
998 | if (ret) { | ||
999 | mlog_errno(ret); | ||
1000 | goto out; | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | /* | ||
1005 | * Parts of newly allocated pages need to be zero'd. | ||
1006 | * | ||
1007 | * Above, we have also rewritten 'to' and 'from' - as far as | ||
1008 | * the rest of the function is concerned, the entire cluster | ||
1009 | * range inside of a page needs to be written. | ||
1010 | * | ||
1011 | * We can skip this if the page is up to date - it's already | ||
1012 | * been zero'd from being read in as a hole. | ||
1013 | */ | ||
1014 | if (new && !PageUptodate(page)) | ||
1015 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | ||
1016 | wc->w_cpos, zero_from, zero_to); | ||
1017 | |||
1018 | flush_dcache_page(page); | ||
1019 | |||
1020 | if (ocfs2_should_order_data(inode)) { | ||
1021 | ret = walk_page_buffers(handle, | ||
1022 | page_buffers(page), | ||
1023 | from, to, NULL, | ||
1024 | ocfs2_journal_dirty_data); | ||
1025 | if (ret < 0) | ||
1026 | mlog_errno(ret); | ||
1027 | } | ||
1028 | |||
1029 | /* | ||
1030 | * We don't use generic_commit_write() because we need to | ||
1031 | * handle our own i_size update. | ||
1032 | */ | ||
1033 | ret = block_commit_write(page, from, to); | ||
1034 | if (ret) | ||
1035 | mlog_errno(ret); | ||
1036 | out: | ||
1037 | |||
1038 | return copied ? copied : ret; | ||
1039 | } | ||
1040 | |||
1041 | /* | ||
1042 | * Do the actual write of some data into an inode. Optionally allocate | ||
1043 | * in order to fulfill the write. | ||
1044 | * | ||
1045 | * cpos is the logical cluster offset within the file to write at | ||
1046 | * | ||
1047 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
1048 | * zero indicates that allocation is required. In this case, data_ac | ||
1049 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
1050 | * allocation isn't required). | ||
1051 | */ | ||
1052 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | ||
1053 | struct buffer_head *di_bh, | ||
1054 | struct ocfs2_alloc_context *data_ac, | ||
1055 | struct ocfs2_alloc_context *meta_ac, | ||
1056 | struct ocfs2_write_ctxt *wc) | ||
1057 | { | ||
1058 | int ret, i, numpages = 1, new; | ||
1059 | unsigned int copied = 0; | ||
1060 | u32 tmp_pos; | ||
1061 | u64 v_blkno, p_blkno; | ||
1062 | struct address_space *mapping = file->f_mapping; | ||
1063 | struct inode *inode = mapping->host; | ||
1064 | unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
1065 | unsigned long index, start; | ||
1066 | struct page **cpages; | ||
1067 | |||
1068 | new = phys == 0 ? 1 : 0; | ||
1069 | |||
1070 | /* | ||
1071 | * Figure out how many pages we'll be manipulating here. For | ||
1072 | * non-allocating write, or any writes where cluster size is | ||
1073 | * less than page size, we only need one page. Otherwise, | ||
1074 | * allocating writes of cluster size larger than page size | ||
1075 | * need cluster size pages. | ||
1076 | */ | ||
1077 | if (new && !wc->w_large_pages) | ||
1078 | numpages = (1 << cbits) / PAGE_SIZE; | ||
1079 | |||
1080 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
1081 | if (!cpages) { | ||
1082 | ret = -ENOMEM; | ||
1083 | mlog_errno(ret); | ||
1084 | return ret; | ||
1085 | } | ||
1086 | |||
1087 | /* | ||
1088 | * Fill our page array first. That way we've grabbed enough so | ||
1089 | * that we can zero and flush if we error after adding the | ||
1090 | * extent. | ||
1091 | */ | ||
1092 | if (new) { | ||
1093 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | ||
1094 | wc->w_cpos); | ||
1095 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
1096 | } else { | ||
1097 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | ||
1098 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | ||
1099 | } | ||
1100 | |||
1101 | for(i = 0; i < numpages; i++) { | ||
1102 | index = start + i; | ||
1103 | |||
1104 | cpages[i] = grab_cache_page(mapping, index); | ||
1105 | if (!cpages[i]) { | ||
1106 | ret = -ENOMEM; | ||
1107 | mlog_errno(ret); | ||
1108 | goto out; | ||
1109 | } | ||
1110 | } | ||
1111 | |||
1112 | if (new) { | ||
1113 | /* | ||
1114 | * This is safe to call with the page locks - it won't take | ||
1115 | * any additional semaphores or cluster locks. | ||
1116 | */ | ||
1117 | tmp_pos = wc->w_cpos; | ||
1118 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | ||
1119 | &tmp_pos, 1, di_bh, handle, | ||
1120 | data_ac, meta_ac, NULL); | ||
1121 | /* | ||
1122 | * This shouldn't happen because we must have already | ||
1123 | * calculated the correct meta data allocation required. The | ||
1124 | * internal tree allocation code should know how to increase | ||
1125 | * transaction credits itself. | ||
1126 | * | ||
1127 | * If need be, we could handle -EAGAIN for a | ||
1128 | * RESTART_TRANS here. | ||
1129 | */ | ||
1130 | mlog_bug_on_msg(ret == -EAGAIN, | ||
1131 | "Inode %llu: EAGAIN return during allocation.\n", | ||
1132 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1133 | if (ret < 0) { | ||
1134 | mlog_errno(ret); | ||
1135 | goto out; | ||
1136 | } | ||
1137 | } | ||
1138 | |||
1139 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL); | ||
1140 | if (ret < 0) { | ||
1141 | |||
1142 | /* | ||
1143 | * XXX: Should we go readonly here? | ||
1144 | */ | ||
1145 | |||
1146 | mlog_errno(ret); | ||
1147 | goto out; | ||
1148 | } | ||
1149 | |||
1150 | BUG_ON(p_blkno == 0); | ||
1151 | |||
1152 | for(i = 0; i < numpages; i++) { | ||
1153 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | ||
1154 | wc, new); | ||
1155 | if (ret < 0) { | ||
1156 | mlog_errno(ret); | ||
1157 | goto out; | ||
1158 | } | ||
1159 | |||
1160 | copied += ret; | ||
1161 | } | ||
1162 | |||
1163 | out: | ||
1164 | for(i = 0; i < numpages; i++) { | ||
1165 | unlock_page(cpages[i]); | ||
1166 | mark_page_accessed(cpages[i]); | ||
1167 | page_cache_release(cpages[i]); | ||
1168 | } | ||
1169 | kfree(cpages); | ||
1170 | |||
1171 | return copied ? copied : ret; | ||
1172 | } | ||
1173 | |||
1174 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | ||
1175 | struct ocfs2_super *osb, loff_t pos, | ||
1176 | size_t count, ocfs2_page_writer *cb, | ||
1177 | void *cb_priv) | ||
1178 | { | ||
1179 | wc->w_count = count; | ||
1180 | wc->w_pos = pos; | ||
1181 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
1182 | wc->w_finished_copy = 0; | ||
1183 | |||
1184 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | ||
1185 | wc->w_large_pages = 1; | ||
1186 | else | ||
1187 | wc->w_large_pages = 0; | ||
1188 | |||
1189 | wc->w_write_data_page = cb; | ||
1190 | wc->w_private = cb_priv; | ||
1191 | } | ||
1192 | |||
1193 | /* | ||
1194 | * Write a cluster to an inode. The cluster may not be allocated yet, | ||
1195 | * in which case it will be. This only exists for buffered writes - | ||
1196 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
1197 | * | ||
1198 | * The caller is responsible for incrementing pos, written counts, etc | ||
1199 | * | ||
1200 | * For file systems that don't support sparse files, pre-allocation | ||
1201 | * and page zeroing up until cpos should be done prior to this | ||
1202 | * function call. | ||
1203 | * | ||
1204 | * Callers should be holding i_sem, and the rw cluster lock. | ||
1205 | * | ||
1206 | * Returns the number of user bytes written, or less than zero for | ||
1207 | * error. | ||
1208 | */ | ||
1209 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
1210 | size_t count, ocfs2_page_writer *actor, | ||
1211 | void *priv) | ||
1212 | { | ||
1213 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | ||
1214 | ssize_t written = 0; | ||
1215 | u32 phys; | ||
1216 | struct inode *inode = file->f_mapping->host; | ||
1217 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1218 | struct buffer_head *di_bh = NULL; | ||
1219 | struct ocfs2_dinode *di; | ||
1220 | struct ocfs2_alloc_context *data_ac = NULL; | ||
1221 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
1222 | handle_t *handle; | ||
1223 | struct ocfs2_write_ctxt wc; | ||
1224 | |||
1225 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
1226 | |||
1227 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
1228 | if (ret) { | ||
1229 | mlog_errno(ret); | ||
1230 | goto out; | ||
1231 | } | ||
1232 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1233 | |||
1234 | /* | ||
1235 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1236 | * the mapping, zeroing and tree manipulation within | ||
1237 | * ocfs2_write() will be safe against ->readpage(). This | ||
1238 | * should also serve to lock out allocation from a shared | ||
1239 | * writeable region. | ||
1240 | */ | ||
1241 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1242 | |||
1243 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL); | ||
1244 | if (ret) { | ||
1245 | mlog_errno(ret); | ||
1246 | goto out_meta; | ||
1247 | } | ||
1248 | |||
1249 | /* phys == 0 means that allocation is required. */ | ||
1250 | if (phys == 0) { | ||
1251 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | ||
1252 | if (ret) { | ||
1253 | mlog_errno(ret); | ||
1254 | goto out_meta; | ||
1255 | } | ||
1256 | |||
1257 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | ||
1258 | } | ||
1259 | |||
1260 | ret = ocfs2_data_lock(inode, 1); | ||
1261 | if (ret) { | ||
1262 | mlog_errno(ret); | ||
1263 | goto out_meta; | ||
1264 | } | ||
1265 | |||
1266 | handle = ocfs2_start_trans(osb, credits); | ||
1267 | if (IS_ERR(handle)) { | ||
1268 | ret = PTR_ERR(handle); | ||
1269 | mlog_errno(ret); | ||
1270 | goto out_data; | ||
1271 | } | ||
1272 | |||
1273 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | ||
1274 | meta_ac, &wc); | ||
1275 | if (written < 0) { | ||
1276 | ret = written; | ||
1277 | mlog_errno(ret); | ||
1278 | goto out_commit; | ||
1279 | } | ||
1280 | |||
1281 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
1282 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1283 | if (ret) { | ||
1284 | mlog_errno(ret); | ||
1285 | goto out_commit; | ||
1286 | } | ||
1287 | |||
1288 | pos += written; | ||
1289 | if (pos > inode->i_size) { | ||
1290 | i_size_write(inode, pos); | ||
1291 | mark_inode_dirty(inode); | ||
1292 | } | ||
1293 | inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); | ||
1294 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
1295 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
1296 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1297 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1298 | |||
1299 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1300 | if (ret) | ||
1301 | mlog_errno(ret); | ||
1302 | |||
1303 | out_commit: | ||
1304 | ocfs2_commit_trans(osb, handle); | ||
1305 | |||
1306 | out_data: | ||
1307 | ocfs2_data_unlock(inode, 1); | ||
1308 | |||
1309 | out_meta: | ||
1310 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1311 | ocfs2_meta_unlock(inode, 1); | ||
1312 | |||
1313 | out: | ||
1314 | brelse(di_bh); | ||
1315 | if (data_ac) | ||
1316 | ocfs2_free_alloc_context(data_ac); | ||
1317 | if (meta_ac) | ||
1318 | ocfs2_free_alloc_context(meta_ac); | ||
1319 | |||
1320 | return written ? written : ret; | ||
1321 | } | ||
1322 | |||
676 | const struct address_space_operations ocfs2_aops = { | 1323 | const struct address_space_operations ocfs2_aops = { |
677 | .readpage = ocfs2_readpage, | 1324 | .readpage = ocfs2_readpage, |
678 | .writepage = ocfs2_writepage, | 1325 | .writepage = ocfs2_writepage, |