diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-02-09 23:24:12 -0500 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-04-26 18:02:08 -0400 |
commit | 9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 (patch) | |
tree | 3cac0c18d0cacc316e0e8a60f483282d6f991779 | |
parent | 89488984ac23b0580f959b9ee549f2fcb1c2f194 (diff) |
ocfs2: teach ocfs2_file_aio_write() about sparse files
Unfortunately, ocfs2 can no longer make use of generic_file_aio_write_nolock()
because allocating writes will require zeroing of pages adjacent to the I/O
for cluster sizes greater than page size.
Implement a custom file write here, which can order page locks for zeroing.
This also has the advantage that cluster locks can easily be ordered outside
of the page locks.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r-- | fs/ocfs2/aops.c | 679 | ||||
-rw-r--r-- | fs/ocfs2/aops.h | 38 | ||||
-rw-r--r-- | fs/ocfs2/extent_map.c | 4 | ||||
-rw-r--r-- | fs/ocfs2/extent_map.h | 2 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 374 | ||||
-rw-r--r-- | fs/ocfs2/file.h | 4 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 32 |
7 files changed, 1076 insertions, 57 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f3b0cc5cba1a..5ffb3702b5e9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
27 | #include <linux/swap.h> | ||
27 | 28 | ||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | 29 | #define MLOG_MASK_PREFIX ML_FILE_IO |
29 | #include <cluster/masklog.h> | 30 | #include <cluster/masklog.h> |
@@ -37,6 +38,7 @@ | |||
37 | #include "file.h" | 38 | #include "file.h" |
38 | #include "inode.h" | 39 | #include "inode.h" |
39 | #include "journal.h" | 40 | #include "journal.h" |
41 | #include "suballoc.h" | ||
40 | #include "super.h" | 42 | #include "super.h" |
41 | #include "symlink.h" | 43 | #include "symlink.h" |
42 | 44 | ||
@@ -645,23 +647,27 @@ static ssize_t ocfs2_direct_IO(int rw, | |||
645 | 647 | ||
646 | mlog_entry_void(); | 648 | mlog_entry_void(); |
647 | 649 | ||
648 | /* | 650 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
649 | * We get PR data locks even for O_DIRECT. This allows | 651 | /* |
650 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with | 652 | * We get PR data locks even for O_DIRECT. This |
651 | * extending and buffered zeroing writes race. If they did | 653 | * allows concurrent O_DIRECT I/O but doesn't let |
652 | * race then the buffered zeroing could be written back after | 654 | * O_DIRECT with extending and buffered zeroing writes |
653 | * the O_DIRECT I/O. It's one thing to tell people not to mix | 655 | * race. If they did race then the buffered zeroing |
654 | * buffered and O_DIRECT writes, but expecting them to | 656 | * could be written back after the O_DIRECT I/O. It's |
655 | * understand that file extension is also an implicit buffered | 657 | * one thing to tell people not to mix buffered and |
656 | * write is too much. By getting the PR we force writeback of | 658 | * O_DIRECT writes, but expecting them to understand |
657 | * the buffered zeroing before proceeding. | 659 | * that file extension is also an implicit buffered |
658 | */ | 660 | * write is too much. By getting the PR we force |
659 | ret = ocfs2_data_lock(inode, 0); | 661 | * writeback of the buffered zeroing before |
660 | if (ret < 0) { | 662 | * proceeding. |
661 | mlog_errno(ret); | 663 | */ |
662 | goto out; | 664 | ret = ocfs2_data_lock(inode, 0); |
665 | if (ret < 0) { | ||
666 | mlog_errno(ret); | ||
667 | goto out; | ||
668 | } | ||
669 | ocfs2_data_unlock(inode, 0); | ||
663 | } | 670 | } |
664 | ocfs2_data_unlock(inode, 0); | ||
665 | 671 | ||
666 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 672 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
667 | inode->i_sb->s_bdev, iov, offset, | 673 | inode->i_sb->s_bdev, iov, offset, |
@@ -673,6 +679,647 @@ out: | |||
673 | return ret; | 679 | return ret; |
674 | } | 680 | } |
675 | 681 | ||
682 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, | ||
683 | u32 cpos, | ||
684 | unsigned int *start, | ||
685 | unsigned int *end) | ||
686 | { | ||
687 | unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; | ||
688 | |||
689 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { | ||
690 | unsigned int cpp; | ||
691 | |||
692 | cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); | ||
693 | |||
694 | cluster_start = cpos % cpp; | ||
695 | cluster_start = cluster_start << osb->s_clustersize_bits; | ||
696 | |||
697 | cluster_end = cluster_start + osb->s_clustersize; | ||
698 | } | ||
699 | |||
700 | BUG_ON(cluster_start > PAGE_SIZE); | ||
701 | BUG_ON(cluster_end > PAGE_SIZE); | ||
702 | |||
703 | if (start) | ||
704 | *start = cluster_start; | ||
705 | if (end) | ||
706 | *end = cluster_end; | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * 'from' and 'to' are the region in the page to avoid zeroing. | ||
711 | * | ||
712 | * If pagesize > clustersize, this function will avoid zeroing outside | ||
713 | * of the cluster boundary. | ||
714 | * | ||
715 | * from == to == 0 is code for "zero the entire cluster region" | ||
716 | */ | ||
717 | static void ocfs2_clear_page_regions(struct page *page, | ||
718 | struct ocfs2_super *osb, u32 cpos, | ||
719 | unsigned from, unsigned to) | ||
720 | { | ||
721 | void *kaddr; | ||
722 | unsigned int cluster_start, cluster_end; | ||
723 | |||
724 | ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); | ||
725 | |||
726 | kaddr = kmap_atomic(page, KM_USER0); | ||
727 | |||
728 | if (from || to) { | ||
729 | if (from > cluster_start) | ||
730 | memset(kaddr + cluster_start, 0, from - cluster_start); | ||
731 | if (to < cluster_end) | ||
732 | memset(kaddr + to, 0, cluster_end - to); | ||
733 | } else { | ||
734 | memset(kaddr + cluster_start, 0, cluster_end - cluster_start); | ||
735 | } | ||
736 | |||
737 | kunmap_atomic(kaddr, KM_USER0); | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Some of this taken from block_prepare_write(). We already have our | ||
742 | * mapping by now though, and the entire write will be allocating or | ||
743 | * it won't, so not much need to use BH_New. | ||
744 | * | ||
745 | * This will also skip zeroing, which is handled externally. | ||
746 | */ | ||
747 | static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | ||
748 | struct inode *inode, unsigned int from, | ||
749 | unsigned int to, int new) | ||
750 | { | ||
751 | int ret = 0; | ||
752 | struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; | ||
753 | unsigned int block_end, block_start; | ||
754 | unsigned int bsize = 1 << inode->i_blkbits; | ||
755 | |||
756 | if (!page_has_buffers(page)) | ||
757 | create_empty_buffers(page, bsize, 0); | ||
758 | |||
759 | head = page_buffers(page); | ||
760 | for (bh = head, block_start = 0; bh != head || !block_start; | ||
761 | bh = bh->b_this_page, block_start += bsize) { | ||
762 | block_end = block_start + bsize; | ||
763 | |||
764 | /* | ||
765 | * Ignore blocks outside of our i/o range - | ||
766 | * they may belong to unallocated clusters. | ||
767 | */ | ||
768 | if (block_start >= to || | ||
769 | (block_start + bsize) <= from) { | ||
770 | if (PageUptodate(page)) | ||
771 | set_buffer_uptodate(bh); | ||
772 | continue; | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * For an allocating write with cluster size >= page | ||
777 | * size, we always write the entire page. | ||
778 | */ | ||
779 | |||
780 | if (buffer_new(bh)) | ||
781 | clear_buffer_new(bh); | ||
782 | |||
783 | if (!buffer_mapped(bh)) { | ||
784 | map_bh(bh, inode->i_sb, *p_blkno); | ||
785 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
786 | } | ||
787 | |||
788 | if (PageUptodate(page)) { | ||
789 | if (!buffer_uptodate(bh)) | ||
790 | set_buffer_uptodate(bh); | ||
791 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | ||
792 | (block_start < from || block_end > to)) { | ||
793 | ll_rw_block(READ, 1, &bh); | ||
794 | *wait_bh++=bh; | ||
795 | } | ||
796 | |||
797 | *p_blkno = *p_blkno + 1; | ||
798 | } | ||
799 | |||
800 | /* | ||
801 | * If we issued read requests - let them complete. | ||
802 | */ | ||
803 | while(wait_bh > wait) { | ||
804 | wait_on_buffer(*--wait_bh); | ||
805 | if (!buffer_uptodate(*wait_bh)) | ||
806 | ret = -EIO; | ||
807 | } | ||
808 | |||
809 | if (ret == 0 || !new) | ||
810 | return ret; | ||
811 | |||
812 | /* | ||
813 | * If we get -EIO above, zero out any newly allocated blocks | ||
814 | * to avoid exposing stale data. | ||
815 | */ | ||
816 | bh = head; | ||
817 | block_start = 0; | ||
818 | do { | ||
819 | void *kaddr; | ||
820 | |||
821 | block_end = block_start + bsize; | ||
822 | if (block_end <= from) | ||
823 | goto next_bh; | ||
824 | if (block_start >= to) | ||
825 | break; | ||
826 | |||
827 | kaddr = kmap_atomic(page, KM_USER0); | ||
828 | memset(kaddr+block_start, 0, bh->b_size); | ||
829 | flush_dcache_page(page); | ||
830 | kunmap_atomic(kaddr, KM_USER0); | ||
831 | set_buffer_uptodate(bh); | ||
832 | mark_buffer_dirty(bh); | ||
833 | |||
834 | next_bh: | ||
835 | block_start = block_end; | ||
836 | bh = bh->b_this_page; | ||
837 | } while (bh != head); | ||
838 | |||
839 | return ret; | ||
840 | } | ||
841 | |||
842 | /* | ||
843 | * This will copy user data from the iovec in the buffered write | ||
844 | * context. | ||
845 | */ | ||
846 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
847 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
848 | unsigned int *ret_from, unsigned int *ret_to) | ||
849 | { | ||
850 | int ret; | ||
851 | unsigned int to, from, cluster_start, cluster_end; | ||
852 | unsigned long bytes, src_from; | ||
853 | char *dst; | ||
854 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
855 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
856 | char __user *buf; | ||
857 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
858 | |||
859 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
860 | &cluster_end); | ||
861 | |||
862 | buf = cur_iov->iov_base + bp->b_cur_off; | ||
863 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | ||
864 | |||
865 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | ||
866 | |||
867 | /* | ||
868 | * This is a lot of comparisons, but it reads quite | ||
869 | * easily, which is important here. | ||
870 | */ | ||
871 | /* Stay within the src page */ | ||
872 | bytes = PAGE_SIZE - src_from; | ||
873 | /* Stay within the vector */ | ||
874 | bytes = min(bytes, | ||
875 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
876 | /* Stay within count */ | ||
877 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
878 | /* | ||
879 | * For clustersize > page size, just stay within | ||
880 | * target page, otherwise we have to calculate pos | ||
881 | * within the cluster and obey the rightmost | ||
882 | * boundary. | ||
883 | */ | ||
884 | if (wc->w_large_pages) { | ||
885 | /* | ||
886 | * For cluster size < page size, we have to | ||
887 | * calculate pos within the cluster and obey | ||
888 | * the rightmost boundary. | ||
889 | */ | ||
890 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
891 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
892 | } else { | ||
893 | /* | ||
894 | * cluster size > page size is the most common | ||
895 | * case - we just stay within the target page | ||
896 | * boundary. | ||
897 | */ | ||
898 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
899 | } | ||
900 | |||
901 | to = from + bytes; | ||
902 | |||
903 | if (wc->w_this_page_new) | ||
904 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
905 | cluster_start, cluster_end, 1); | ||
906 | else | ||
907 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
908 | from, to, 0); | ||
909 | if (ret) { | ||
910 | mlog_errno(ret); | ||
911 | goto out; | ||
912 | } | ||
913 | |||
914 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
915 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
916 | BUG_ON(from > osb->s_clustersize); | ||
917 | BUG_ON(to > osb->s_clustersize); | ||
918 | |||
919 | dst = kmap(wc->w_this_page); | ||
920 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | ||
921 | kunmap(wc->w_this_page); | ||
922 | |||
923 | /* | ||
924 | * XXX: This is slow, but simple. The caller of | ||
925 | * ocfs2_buffered_write_cluster() is responsible for | ||
926 | * passing through the iovecs, so it's difficult to | ||
927 | * predict what our next step is in here after our | ||
928 | * initial write. A future version should be pushing | ||
929 | * that iovec manipulation further down. | ||
930 | * | ||
931 | * By setting this, we indicate that a copy from user | ||
932 | * data was done, and subsequent calls for this | ||
933 | * cluster will skip copying more data. | ||
934 | */ | ||
935 | wc->w_finished_copy = 1; | ||
936 | |||
937 | *ret_from = from; | ||
938 | *ret_to = to; | ||
939 | out: | ||
940 | |||
941 | return bytes ? (unsigned int)bytes : ret; | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * Map, fill and write a page to disk. | ||
946 | * | ||
947 | * The work of copying data is done via callback. Newly allocated | ||
948 | * pages which don't take user data will be zero'd (set 'new' to | ||
949 | * indicate an allocating write) | ||
950 | * | ||
951 | * Returns a negative error code or the number of bytes copied into | ||
952 | * the page. | ||
953 | */ | ||
954 | int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
955 | u64 *p_blkno, struct page *page, | ||
956 | struct ocfs2_write_ctxt *wc, int new) | ||
957 | { | ||
958 | int ret, copied = 0; | ||
959 | unsigned int from = 0, to = 0; | ||
960 | unsigned int cluster_start, cluster_end; | ||
961 | unsigned int zero_from = 0, zero_to = 0; | ||
962 | |||
963 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | ||
964 | &cluster_start, &cluster_end); | ||
965 | |||
966 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | ||
967 | && !wc->w_finished_copy) { | ||
968 | |||
969 | wc->w_this_page = page; | ||
970 | wc->w_this_page_new = new; | ||
971 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | ||
972 | if (ret < 0) { | ||
973 | mlog_errno(ret); | ||
974 | goto out; | ||
975 | } | ||
976 | |||
977 | copied = ret; | ||
978 | |||
979 | zero_from = from; | ||
980 | zero_to = to; | ||
981 | if (new) { | ||
982 | from = cluster_start; | ||
983 | to = cluster_end; | ||
984 | } | ||
985 | } else { | ||
986 | /* | ||
987 | * If we haven't allocated the new page yet, we | ||
988 | * shouldn't be writing it out without copying user | ||
989 | * data. This is likely a math error from the caller. | ||
990 | */ | ||
991 | BUG_ON(!new); | ||
992 | |||
993 | from = cluster_start; | ||
994 | to = cluster_end; | ||
995 | |||
996 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
997 | cluster_start, cluster_end, 1); | ||
998 | if (ret) { | ||
999 | mlog_errno(ret); | ||
1000 | goto out; | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | /* | ||
1005 | * Parts of newly allocated pages need to be zero'd. | ||
1006 | * | ||
1007 | * Above, we have also rewritten 'to' and 'from' - as far as | ||
1008 | * the rest of the function is concerned, the entire cluster | ||
1009 | * range inside of a page needs to be written. | ||
1010 | * | ||
1011 | * We can skip this if the page is up to date - it's already | ||
1012 | * been zero'd from being read in as a hole. | ||
1013 | */ | ||
1014 | if (new && !PageUptodate(page)) | ||
1015 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | ||
1016 | wc->w_cpos, zero_from, zero_to); | ||
1017 | |||
1018 | flush_dcache_page(page); | ||
1019 | |||
1020 | if (ocfs2_should_order_data(inode)) { | ||
1021 | ret = walk_page_buffers(handle, | ||
1022 | page_buffers(page), | ||
1023 | from, to, NULL, | ||
1024 | ocfs2_journal_dirty_data); | ||
1025 | if (ret < 0) | ||
1026 | mlog_errno(ret); | ||
1027 | } | ||
1028 | |||
1029 | /* | ||
1030 | * We don't use generic_commit_write() because we need to | ||
1031 | * handle our own i_size update. | ||
1032 | */ | ||
1033 | ret = block_commit_write(page, from, to); | ||
1034 | if (ret) | ||
1035 | mlog_errno(ret); | ||
1036 | out: | ||
1037 | |||
1038 | return copied ? copied : ret; | ||
1039 | } | ||
1040 | |||
1041 | /* | ||
1042 | * Do the actual write of some data into an inode. Optionally allocate | ||
1043 | * in order to fulfill the write. | ||
1044 | * | ||
1045 | * cpos is the logical cluster offset within the file to write at | ||
1046 | * | ||
1047 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
1048 | * zero indicates that allocation is required. In this case, data_ac | ||
1049 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
1050 | * allocation isn't required). | ||
1051 | */ | ||
1052 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | ||
1053 | struct buffer_head *di_bh, | ||
1054 | struct ocfs2_alloc_context *data_ac, | ||
1055 | struct ocfs2_alloc_context *meta_ac, | ||
1056 | struct ocfs2_write_ctxt *wc) | ||
1057 | { | ||
1058 | int ret, i, numpages = 1, new; | ||
1059 | unsigned int copied = 0; | ||
1060 | u32 tmp_pos; | ||
1061 | u64 v_blkno, p_blkno; | ||
1062 | struct address_space *mapping = file->f_mapping; | ||
1063 | struct inode *inode = mapping->host; | ||
1064 | unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
1065 | unsigned long index, start; | ||
1066 | struct page **cpages; | ||
1067 | |||
1068 | new = phys == 0 ? 1 : 0; | ||
1069 | |||
1070 | /* | ||
1071 | * Figure out how many pages we'll be manipulating here. For | ||
1072 | * non-allocating write, or any writes where cluster size is | ||
1073 | * less than page size, we only need one page. Otherwise, | ||
1074 | * allocating writes of cluster size larger than page size | ||
1075 | * need cluster size pages. | ||
1076 | */ | ||
1077 | if (new && !wc->w_large_pages) | ||
1078 | numpages = (1 << cbits) / PAGE_SIZE; | ||
1079 | |||
1080 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
1081 | if (!cpages) { | ||
1082 | ret = -ENOMEM; | ||
1083 | mlog_errno(ret); | ||
1084 | return ret; | ||
1085 | } | ||
1086 | |||
1087 | /* | ||
1088 | * Fill our page array first. That way we've grabbed enough so | ||
1089 | * that we can zero and flush if we error after adding the | ||
1090 | * extent. | ||
1091 | */ | ||
1092 | if (new) { | ||
1093 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | ||
1094 | wc->w_cpos); | ||
1095 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
1096 | } else { | ||
1097 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | ||
1098 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | ||
1099 | } | ||
1100 | |||
1101 | for(i = 0; i < numpages; i++) { | ||
1102 | index = start + i; | ||
1103 | |||
1104 | cpages[i] = grab_cache_page(mapping, index); | ||
1105 | if (!cpages[i]) { | ||
1106 | ret = -ENOMEM; | ||
1107 | mlog_errno(ret); | ||
1108 | goto out; | ||
1109 | } | ||
1110 | } | ||
1111 | |||
1112 | if (new) { | ||
1113 | /* | ||
1114 | * This is safe to call with the page locks - it won't take | ||
1115 | * any additional semaphores or cluster locks. | ||
1116 | */ | ||
1117 | tmp_pos = wc->w_cpos; | ||
1118 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | ||
1119 | &tmp_pos, 1, di_bh, handle, | ||
1120 | data_ac, meta_ac, NULL); | ||
1121 | /* | ||
1122 | * This shouldn't happen because we must have already | ||
1123 | * calculated the correct meta data allocation required. The | ||
1124 | * internal tree allocation code should know how to increase | ||
1125 | * transaction credits itself. | ||
1126 | * | ||
1127 | * If need be, we could handle -EAGAIN for a | ||
1128 | * RESTART_TRANS here. | ||
1129 | */ | ||
1130 | mlog_bug_on_msg(ret == -EAGAIN, | ||
1131 | "Inode %llu: EAGAIN return during allocation.\n", | ||
1132 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1133 | if (ret < 0) { | ||
1134 | mlog_errno(ret); | ||
1135 | goto out; | ||
1136 | } | ||
1137 | } | ||
1138 | |||
1139 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL); | ||
1140 | if (ret < 0) { | ||
1141 | |||
1142 | /* | ||
1143 | * XXX: Should we go readonly here? | ||
1144 | */ | ||
1145 | |||
1146 | mlog_errno(ret); | ||
1147 | goto out; | ||
1148 | } | ||
1149 | |||
1150 | BUG_ON(p_blkno == 0); | ||
1151 | |||
1152 | for(i = 0; i < numpages; i++) { | ||
1153 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | ||
1154 | wc, new); | ||
1155 | if (ret < 0) { | ||
1156 | mlog_errno(ret); | ||
1157 | goto out; | ||
1158 | } | ||
1159 | |||
1160 | copied += ret; | ||
1161 | } | ||
1162 | |||
1163 | out: | ||
1164 | for(i = 0; i < numpages; i++) { | ||
1165 | unlock_page(cpages[i]); | ||
1166 | mark_page_accessed(cpages[i]); | ||
1167 | page_cache_release(cpages[i]); | ||
1168 | } | ||
1169 | kfree(cpages); | ||
1170 | |||
1171 | return copied ? copied : ret; | ||
1172 | } | ||
1173 | |||
1174 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | ||
1175 | struct ocfs2_super *osb, loff_t pos, | ||
1176 | size_t count, ocfs2_page_writer *cb, | ||
1177 | void *cb_priv) | ||
1178 | { | ||
1179 | wc->w_count = count; | ||
1180 | wc->w_pos = pos; | ||
1181 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
1182 | wc->w_finished_copy = 0; | ||
1183 | |||
1184 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | ||
1185 | wc->w_large_pages = 1; | ||
1186 | else | ||
1187 | wc->w_large_pages = 0; | ||
1188 | |||
1189 | wc->w_write_data_page = cb; | ||
1190 | wc->w_private = cb_priv; | ||
1191 | } | ||
1192 | |||
1193 | /* | ||
1194 | * Write a cluster to an inode. The cluster may not be allocated yet, | ||
1195 | * in which case it will be. This only exists for buffered writes - | ||
1196 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
1197 | * | ||
1198 | * The caller is responsible for incrementing pos, written counts, etc | ||
1199 | * | ||
1200 | * For file systems that don't support sparse files, pre-allocation | ||
1201 | * and page zeroing up until cpos should be done prior to this | ||
1202 | * function call. | ||
1203 | * | ||
1204 | * Callers should be holding i_sem, and the rw cluster lock. | ||
1205 | * | ||
1206 | * Returns the number of user bytes written, or less than zero for | ||
1207 | * error. | ||
1208 | */ | ||
1209 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
1210 | size_t count, ocfs2_page_writer *actor, | ||
1211 | void *priv) | ||
1212 | { | ||
1213 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | ||
1214 | ssize_t written = 0; | ||
1215 | u32 phys; | ||
1216 | struct inode *inode = file->f_mapping->host; | ||
1217 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1218 | struct buffer_head *di_bh = NULL; | ||
1219 | struct ocfs2_dinode *di; | ||
1220 | struct ocfs2_alloc_context *data_ac = NULL; | ||
1221 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
1222 | handle_t *handle; | ||
1223 | struct ocfs2_write_ctxt wc; | ||
1224 | |||
1225 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
1226 | |||
1227 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
1228 | if (ret) { | ||
1229 | mlog_errno(ret); | ||
1230 | goto out; | ||
1231 | } | ||
1232 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1233 | |||
1234 | /* | ||
1235 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1236 | * the mapping, zeroing and tree manipulation within | ||
1237 | * ocfs2_write() will be safe against ->readpage(). This | ||
1238 | * should also serve to lock out allocation from a shared | ||
1239 | * writeable region. | ||
1240 | */ | ||
1241 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1242 | |||
1243 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL); | ||
1244 | if (ret) { | ||
1245 | mlog_errno(ret); | ||
1246 | goto out_meta; | ||
1247 | } | ||
1248 | |||
1249 | /* phys == 0 means that allocation is required. */ | ||
1250 | if (phys == 0) { | ||
1251 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | ||
1252 | if (ret) { | ||
1253 | mlog_errno(ret); | ||
1254 | goto out_meta; | ||
1255 | } | ||
1256 | |||
1257 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | ||
1258 | } | ||
1259 | |||
1260 | ret = ocfs2_data_lock(inode, 1); | ||
1261 | if (ret) { | ||
1262 | mlog_errno(ret); | ||
1263 | goto out_meta; | ||
1264 | } | ||
1265 | |||
1266 | handle = ocfs2_start_trans(osb, credits); | ||
1267 | if (IS_ERR(handle)) { | ||
1268 | ret = PTR_ERR(handle); | ||
1269 | mlog_errno(ret); | ||
1270 | goto out_data; | ||
1271 | } | ||
1272 | |||
1273 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | ||
1274 | meta_ac, &wc); | ||
1275 | if (written < 0) { | ||
1276 | ret = written; | ||
1277 | mlog_errno(ret); | ||
1278 | goto out_commit; | ||
1279 | } | ||
1280 | |||
1281 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
1282 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1283 | if (ret) { | ||
1284 | mlog_errno(ret); | ||
1285 | goto out_commit; | ||
1286 | } | ||
1287 | |||
1288 | pos += written; | ||
1289 | if (pos > inode->i_size) { | ||
1290 | i_size_write(inode, pos); | ||
1291 | mark_inode_dirty(inode); | ||
1292 | } | ||
1293 | inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); | ||
1294 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
1295 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
1296 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1297 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1298 | |||
1299 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1300 | if (ret) | ||
1301 | mlog_errno(ret); | ||
1302 | |||
1303 | out_commit: | ||
1304 | ocfs2_commit_trans(osb, handle); | ||
1305 | |||
1306 | out_data: | ||
1307 | ocfs2_data_unlock(inode, 1); | ||
1308 | |||
1309 | out_meta: | ||
1310 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1311 | ocfs2_meta_unlock(inode, 1); | ||
1312 | |||
1313 | out: | ||
1314 | brelse(di_bh); | ||
1315 | if (data_ac) | ||
1316 | ocfs2_free_alloc_context(data_ac); | ||
1317 | if (meta_ac) | ||
1318 | ocfs2_free_alloc_context(meta_ac); | ||
1319 | |||
1320 | return written ? written : ret; | ||
1321 | } | ||
1322 | |||
676 | const struct address_space_operations ocfs2_aops = { | 1323 | const struct address_space_operations ocfs2_aops = { |
677 | .readpage = ocfs2_readpage, | 1324 | .readpage = ocfs2_readpage, |
678 | .writepage = ocfs2_writepage, | 1325 | .writepage = ocfs2_writepage, |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f446a15eab88..eeb2c42483e8 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -30,6 +30,44 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
30 | unsigned from, | 30 | unsigned from, |
31 | unsigned to); | 31 | unsigned to); |
32 | 32 | ||
33 | struct ocfs2_write_ctxt; | ||
34 | typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, | ||
35 | u64 *, unsigned int *, unsigned int *); | ||
36 | |||
37 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
38 | size_t count, ocfs2_page_writer *actor, | ||
39 | void *priv); | ||
40 | |||
41 | struct ocfs2_write_ctxt { | ||
42 | size_t w_count; | ||
43 | loff_t w_pos; | ||
44 | u32 w_cpos; | ||
45 | unsigned int w_finished_copy; | ||
46 | |||
47 | /* This is true if page_size > cluster_size */ | ||
48 | unsigned int w_large_pages; | ||
49 | |||
50 | /* Filler callback and private data */ | ||
51 | ocfs2_page_writer *w_write_data_page; | ||
52 | void *w_private; | ||
53 | |||
54 | /* Only valid for the filler callback */ | ||
55 | struct page *w_this_page; | ||
56 | unsigned int w_this_page_new; | ||
57 | }; | ||
58 | |||
59 | struct ocfs2_buffered_write_priv { | ||
60 | char *b_src_buf; | ||
61 | const struct iovec *b_cur_iov; /* Current iovec */ | ||
62 | size_t b_cur_off; /* Offset in the | ||
63 | * current iovec */ | ||
64 | }; | ||
65 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
66 | struct ocfs2_write_ctxt *wc, | ||
67 | u64 *p_blkno, | ||
68 | unsigned int *ret_from, | ||
69 | unsigned int *ret_to); | ||
70 | |||
33 | /* all ocfs2_dio_end_io()'s fault */ | 71 | /* all ocfs2_dio_end_io()'s fault */ |
34 | #define ocfs2_iocb_is_rw_locked(iocb) \ | 72 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
35 | test_bit(0, (unsigned long *)&iocb->private) | 73 | test_bit(0, (unsigned long *)&iocb->private) |
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 3b4322fd369a..937c2722b753 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -67,8 +67,8 @@ static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, | |||
67 | return ret; | 67 | return ret; |
68 | } | 68 | } |
69 | 69 | ||
70 | static int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, | 70 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, |
71 | u32 *p_cluster, u32 *num_clusters) | 71 | u32 *p_cluster, u32 *num_clusters) |
72 | { | 72 | { |
73 | int ret, i; | 73 | int ret, i; |
74 | struct buffer_head *di_bh = NULL; | 74 | struct buffer_head *di_bh = NULL; |
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 036e23251448..625d0ee5e04a 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h | |||
@@ -25,6 +25,8 @@ | |||
25 | #ifndef _EXTENT_MAP_H | 25 | #ifndef _EXTENT_MAP_H |
26 | #define _EXTENT_MAP_H | 26 | #define _EXTENT_MAP_H |
27 | 27 | ||
28 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, | ||
29 | u32 *num_clusters); | ||
28 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, | 30 | int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, |
29 | int *ret_count); | 31 | int *ret_count); |
30 | 32 | ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3bcf3629265e..667e5a869bf5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/pipe_fs_i.h> | 34 | #include <linux/pipe_fs_i.h> |
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | #include <linux/writeback.h> | ||
36 | 37 | ||
37 | #define MLOG_MASK_PREFIX ML_INODE | 38 | #define MLOG_MASK_PREFIX ML_INODE |
38 | #include <cluster/masklog.h> | 39 | #include <cluster/masklog.h> |
@@ -485,13 +486,13 @@ leave: | |||
485 | * accessed, and lock them, reserving the appropriate number of bits. | 486 | * accessed, and lock them, reserving the appropriate number of bits. |
486 | * | 487 | * |
487 | * Called from ocfs2_extend_allocation() for file systems which don't | 488 | * Called from ocfs2_extend_allocation() for file systems which don't |
488 | * support holes, and from ocfs2_prepare_write() for file systems | 489 | * support holes, and from ocfs2_write() for file systems which |
489 | * which understand sparse inodes. | 490 | * understand sparse inodes. |
490 | */ | 491 | */ |
491 | static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | 492 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, |
492 | u32 clusters_to_add, | 493 | u32 clusters_to_add, |
493 | struct ocfs2_alloc_context **data_ac, | 494 | struct ocfs2_alloc_context **data_ac, |
494 | struct ocfs2_alloc_context **meta_ac) | 495 | struct ocfs2_alloc_context **meta_ac) |
495 | { | 496 | { |
496 | int ret, num_free_extents; | 497 | int ret, num_free_extents; |
497 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 498 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
@@ -518,7 +519,7 @@ static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | |||
518 | * a cluster lock (because we ran out of room for another | 519 | * a cluster lock (because we ran out of room for another |
519 | * extent) will violate ordering rules. | 520 | * extent) will violate ordering rules. |
520 | * | 521 | * |
521 | * Most of the time we'll only be seeing this 1 page at a time | 522 | * Most of the time we'll only be seeing this 1 cluster at a time |
522 | * anyway. | 523 | * anyway. |
523 | */ | 524 | */ |
524 | if (!num_free_extents || | 525 | if (!num_free_extents || |
@@ -596,13 +597,6 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
596 | restart_all: | 597 | restart_all: |
597 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 598 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
598 | 599 | ||
599 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, | ||
600 | &meta_ac); | ||
601 | if (status) { | ||
602 | mlog_errno(status); | ||
603 | goto leave; | ||
604 | } | ||
605 | |||
606 | /* blocks people in read/write from reading our allocation | 600 | /* blocks people in read/write from reading our allocation |
607 | * until we're done changing it. We depend on i_mutex to block | 601 | * until we're done changing it. We depend on i_mutex to block |
608 | * other extend/truncate calls while we're here. Ordering wrt | 602 | * other extend/truncate calls while we're here. Ordering wrt |
@@ -610,6 +604,13 @@ restart_all: | |||
610 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 604 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
611 | drop_alloc_sem = 1; | 605 | drop_alloc_sem = 1; |
612 | 606 | ||
607 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, | ||
608 | &meta_ac); | ||
609 | if (status) { | ||
610 | mlog_errno(status); | ||
611 | goto leave; | ||
612 | } | ||
613 | |||
613 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | 614 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); |
614 | handle = ocfs2_start_trans(osb, credits); | 615 | handle = ocfs2_start_trans(osb, credits); |
615 | if (IS_ERR(handle)) { | 616 | if (IS_ERR(handle)) { |
@@ -1088,10 +1089,49 @@ out: | |||
1088 | return ret; | 1089 | return ret; |
1089 | } | 1090 | } |
1090 | 1091 | ||
1092 | /* | ||
1093 | * Will look for holes and unwritten extents in the range starting at | ||
1094 | * pos for count bytes (inclusive). | ||
1095 | */ | ||
1096 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | ||
1097 | size_t count) | ||
1098 | { | ||
1099 | int ret = 0; | ||
1100 | unsigned int extent_flags; | ||
1101 | u32 cpos, clusters, extent_len, phys_cpos; | ||
1102 | struct super_block *sb = inode->i_sb; | ||
1103 | |||
1104 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | ||
1105 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | ||
1106 | |||
1107 | while (clusters) { | ||
1108 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | ||
1109 | &extent_flags); | ||
1110 | if (ret < 0) { | ||
1111 | mlog_errno(ret); | ||
1112 | goto out; | ||
1113 | } | ||
1114 | |||
1115 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | ||
1116 | ret = 1; | ||
1117 | break; | ||
1118 | } | ||
1119 | |||
1120 | if (extent_len > clusters) | ||
1121 | extent_len = clusters; | ||
1122 | |||
1123 | clusters -= extent_len; | ||
1124 | cpos += extent_len; | ||
1125 | } | ||
1126 | out: | ||
1127 | return ret; | ||
1128 | } | ||
1129 | |||
1091 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1130 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
1092 | loff_t *ppos, | 1131 | loff_t *ppos, |
1093 | size_t count, | 1132 | size_t count, |
1094 | int appending) | 1133 | int appending, |
1134 | int *direct_io) | ||
1095 | { | 1135 | { |
1096 | int ret = 0, meta_level = appending; | 1136 | int ret = 0, meta_level = appending; |
1097 | struct inode *inode = dentry->d_inode; | 1137 | struct inode *inode = dentry->d_inode; |
@@ -1143,12 +1183,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | |||
1143 | saved_pos = *ppos; | 1183 | saved_pos = *ppos; |
1144 | } | 1184 | } |
1145 | 1185 | ||
1186 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
1187 | loff_t end = saved_pos + count; | ||
1188 | |||
1189 | /* | ||
1190 | * Skip the O_DIRECT checks if we don't need | ||
1191 | * them. | ||
1192 | */ | ||
1193 | if (!direct_io || !(*direct_io)) | ||
1194 | break; | ||
1195 | |||
1196 | /* | ||
1197 | * Allowing concurrent direct writes means | ||
1198 | * i_size changes wouldn't be synchronized, so | ||
1199 | * one node could wind up truncating another | ||
1200 | * nodes writes. | ||
1201 | */ | ||
1202 | if (end > i_size_read(inode)) { | ||
1203 | *direct_io = 0; | ||
1204 | break; | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * We don't fill holes during direct io, so | ||
1209 | * check for them here. If any are found, the | ||
1210 | * caller will have to retake some cluster | ||
1211 | * locks and initiate the io as buffered. | ||
1212 | */ | ||
1213 | ret = ocfs2_check_range_for_holes(inode, saved_pos, | ||
1214 | count); | ||
1215 | if (ret == 1) { | ||
1216 | *direct_io = 0; | ||
1217 | ret = 0; | ||
1218 | } else if (ret < 0) | ||
1219 | mlog_errno(ret); | ||
1220 | break; | ||
1221 | } | ||
1222 | |||
1146 | /* | 1223 | /* |
1147 | * The rest of this loop is concerned with legacy file | 1224 | * The rest of this loop is concerned with legacy file |
1148 | * systems which don't support sparse files. | 1225 | * systems which don't support sparse files. |
1149 | */ | 1226 | */ |
1150 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | ||
1151 | break; | ||
1152 | 1227 | ||
1153 | newsize = count + saved_pos; | 1228 | newsize = count + saved_pos; |
1154 | 1229 | ||
@@ -1202,55 +1277,264 @@ out: | |||
1202 | return ret; | 1277 | return ret; |
1203 | } | 1278 | } |
1204 | 1279 | ||
1280 | static inline void | ||
1281 | ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
1282 | { | ||
1283 | const struct iovec *iov = *iovp; | ||
1284 | size_t base = *basep; | ||
1285 | |||
1286 | do { | ||
1287 | int copy = min(bytes, iov->iov_len - base); | ||
1288 | |||
1289 | bytes -= copy; | ||
1290 | base += copy; | ||
1291 | if (iov->iov_len == base) { | ||
1292 | iov++; | ||
1293 | base = 0; | ||
1294 | } | ||
1295 | } while (bytes); | ||
1296 | *iovp = iov; | ||
1297 | *basep = base; | ||
1298 | } | ||
1299 | |||
1300 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | ||
1301 | const struct iovec *cur_iov, | ||
1302 | size_t iov_offset) | ||
1303 | { | ||
1304 | int ret; | ||
1305 | char *buf; | ||
1306 | struct page *src_page = NULL; | ||
1307 | |||
1308 | buf = cur_iov->iov_base + iov_offset; | ||
1309 | |||
1310 | if (!segment_eq(get_fs(), KERNEL_DS)) { | ||
1311 | /* | ||
1312 | * Pull in the user page. We want to do this outside | ||
1313 | * of the meta data locks in order to preserve locking | ||
1314 | * order in case of page fault. | ||
1315 | */ | ||
1316 | ret = get_user_pages(current, current->mm, | ||
1317 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | ||
1318 | 0, 0, &src_page, NULL); | ||
1319 | if (ret == 1) | ||
1320 | bp->b_src_buf = kmap(src_page); | ||
1321 | else | ||
1322 | src_page = ERR_PTR(-EFAULT); | ||
1323 | } else { | ||
1324 | bp->b_src_buf = buf; | ||
1325 | } | ||
1326 | |||
1327 | return src_page; | ||
1328 | } | ||
1329 | |||
1330 | static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, | ||
1331 | struct page *page) | ||
1332 | { | ||
1333 | if (page) { | ||
1334 | kunmap(page); | ||
1335 | page_cache_release(page); | ||
1336 | } | ||
1337 | } | ||
1338 | |||
1339 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | ||
1340 | const struct iovec *iov, | ||
1341 | unsigned long nr_segs, | ||
1342 | size_t count, | ||
1343 | ssize_t o_direct_written) | ||
1344 | { | ||
1345 | int ret = 0; | ||
1346 | ssize_t copied, total = 0; | ||
1347 | size_t iov_offset = 0; | ||
1348 | const struct iovec *cur_iov = iov; | ||
1349 | struct ocfs2_buffered_write_priv bp; | ||
1350 | struct page *page; | ||
1351 | |||
1352 | /* | ||
1353 | * handle partial DIO write. Adjust cur_iov if needed. | ||
1354 | */ | ||
1355 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | ||
1356 | |||
1357 | do { | ||
1358 | bp.b_cur_off = iov_offset; | ||
1359 | bp.b_cur_iov = cur_iov; | ||
1360 | |||
1361 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | ||
1362 | if (IS_ERR(page)) { | ||
1363 | ret = PTR_ERR(page); | ||
1364 | goto out; | ||
1365 | } | ||
1366 | |||
1367 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, | ||
1368 | ocfs2_map_and_write_user_data, | ||
1369 | &bp); | ||
1370 | |||
1371 | ocfs2_put_write_source(&bp, page); | ||
1372 | |||
1373 | if (copied < 0) { | ||
1374 | mlog_errno(copied); | ||
1375 | ret = copied; | ||
1376 | goto out; | ||
1377 | } | ||
1378 | |||
1379 | total += copied; | ||
1380 | *ppos = *ppos + copied; | ||
1381 | count -= copied; | ||
1382 | |||
1383 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | ||
1384 | } while(count); | ||
1385 | |||
1386 | out: | ||
1387 | return total ? total : ret; | ||
1388 | } | ||
1389 | |||
1390 | static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted, | ||
1391 | unsigned long *nr_segs) | ||
1392 | { | ||
1393 | size_t ocount; /* original count */ | ||
1394 | unsigned long seg; | ||
1395 | |||
1396 | ocount = 0; | ||
1397 | for (seg = 0; seg < *nr_segs; seg++) { | ||
1398 | const struct iovec *iv = &iov[seg]; | ||
1399 | |||
1400 | /* | ||
1401 | * If any segment has a negative length, or the cumulative | ||
1402 | * length ever wraps negative then return -EINVAL. | ||
1403 | */ | ||
1404 | ocount += iv->iov_len; | ||
1405 | if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) | ||
1406 | return -EINVAL; | ||
1407 | if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) | ||
1408 | continue; | ||
1409 | if (seg == 0) | ||
1410 | return -EFAULT; | ||
1411 | *nr_segs = seg; | ||
1412 | ocount -= iv->iov_len; /* This segment is no good */ | ||
1413 | break; | ||
1414 | } | ||
1415 | |||
1416 | *counted = ocount; | ||
1417 | return 0; | ||
1418 | } | ||
1419 | |||
1205 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1420 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
1206 | const struct iovec *iov, | 1421 | const struct iovec *iov, |
1207 | unsigned long nr_segs, | 1422 | unsigned long nr_segs, |
1208 | loff_t pos) | 1423 | loff_t pos) |
1209 | { | 1424 | { |
1210 | int ret, rw_level, have_alloc_sem = 0; | 1425 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; |
1211 | struct file *filp = iocb->ki_filp; | 1426 | int can_do_direct, sync = 0; |
1212 | struct inode *inode = filp->f_path.dentry->d_inode; | 1427 | ssize_t written = 0; |
1213 | int appending = filp->f_flags & O_APPEND ? 1 : 0; | 1428 | size_t ocount; /* original count */ |
1214 | 1429 | size_t count; /* after file limit checks */ | |
1215 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1430 | loff_t *ppos = &iocb->ki_pos; |
1431 | struct file *file = iocb->ki_filp; | ||
1432 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1433 | |||
1434 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | ||
1216 | (unsigned int)nr_segs, | 1435 | (unsigned int)nr_segs, |
1217 | filp->f_path.dentry->d_name.len, | 1436 | file->f_path.dentry->d_name.len, |
1218 | filp->f_path.dentry->d_name.name); | 1437 | file->f_path.dentry->d_name.name); |
1219 | 1438 | ||
1220 | /* happy write of zero bytes */ | ||
1221 | if (iocb->ki_left == 0) | 1439 | if (iocb->ki_left == 0) |
1222 | return 0; | 1440 | return 0; |
1223 | 1441 | ||
1442 | ret = ocfs2_check_iovec(iov, &ocount, &nr_segs); | ||
1443 | if (ret) | ||
1444 | return ret; | ||
1445 | |||
1446 | count = ocount; | ||
1447 | |||
1448 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
1449 | |||
1450 | appending = file->f_flags & O_APPEND ? 1 : 0; | ||
1451 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; | ||
1452 | |||
1224 | mutex_lock(&inode->i_mutex); | 1453 | mutex_lock(&inode->i_mutex); |
1454 | |||
1455 | relock: | ||
1225 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1456 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
1226 | if (filp->f_flags & O_DIRECT) { | 1457 | if (direct_io) { |
1227 | have_alloc_sem = 1; | ||
1228 | down_read(&inode->i_alloc_sem); | 1458 | down_read(&inode->i_alloc_sem); |
1459 | have_alloc_sem = 1; | ||
1229 | } | 1460 | } |
1230 | 1461 | ||
1231 | /* concurrent O_DIRECT writes are allowed */ | 1462 | /* concurrent O_DIRECT writes are allowed */ |
1232 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | 1463 | rw_level = !direct_io; |
1233 | ret = ocfs2_rw_lock(inode, rw_level); | 1464 | ret = ocfs2_rw_lock(inode, rw_level); |
1234 | if (ret < 0) { | 1465 | if (ret < 0) { |
1235 | rw_level = -1; | ||
1236 | mlog_errno(ret); | 1466 | mlog_errno(ret); |
1237 | goto out; | 1467 | goto out_sems; |
1238 | } | 1468 | } |
1239 | 1469 | ||
1240 | ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, | 1470 | can_do_direct = direct_io; |
1241 | iocb->ki_left, appending); | 1471 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, |
1472 | iocb->ki_left, appending, | ||
1473 | &can_do_direct); | ||
1242 | if (ret < 0) { | 1474 | if (ret < 0) { |
1243 | mlog_errno(ret); | 1475 | mlog_errno(ret); |
1244 | goto out; | 1476 | goto out; |
1245 | } | 1477 | } |
1246 | 1478 | ||
1479 | /* | ||
1480 | * We can't complete the direct I/O as requested, fall back to | ||
1481 | * buffered I/O. | ||
1482 | */ | ||
1483 | if (direct_io && !can_do_direct) { | ||
1484 | ocfs2_rw_unlock(inode, rw_level); | ||
1485 | up_read(&inode->i_alloc_sem); | ||
1486 | |||
1487 | have_alloc_sem = 0; | ||
1488 | rw_level = -1; | ||
1489 | |||
1490 | direct_io = 0; | ||
1491 | sync = 1; | ||
1492 | goto relock; | ||
1493 | } | ||
1494 | |||
1495 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) | ||
1496 | sync = 1; | ||
1497 | |||
1498 | /* | ||
1499 | * XXX: Is it ok to execute these checks a second time? | ||
1500 | */ | ||
1501 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); | ||
1502 | if (ret) | ||
1503 | goto out; | ||
1504 | |||
1505 | /* | ||
1506 | * Set pos so that sync_page_range_nolock() below understands | ||
1507 | * where to start from. We might've moved it around via the | ||
1508 | * calls above. The range we want to actually sync starts from | ||
1509 | * *ppos here. | ||
1510 | * | ||
1511 | */ | ||
1512 | pos = *ppos; | ||
1513 | |||
1247 | /* communicate with ocfs2_dio_end_io */ | 1514 | /* communicate with ocfs2_dio_end_io */ |
1248 | ocfs2_iocb_set_rw_locked(iocb); | 1515 | ocfs2_iocb_set_rw_locked(iocb); |
1249 | 1516 | ||
1250 | ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); | 1517 | if (direct_io) { |
1518 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | ||
1519 | ppos, count, ocount); | ||
1520 | if (written < 0) { | ||
1521 | ret = written; | ||
1522 | goto out_dio; | ||
1523 | } | ||
1524 | } else { | ||
1525 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, | ||
1526 | count, written); | ||
1527 | if (written < 0) { | ||
1528 | ret = written; | ||
1529 | if (ret != -EFAULT || ret != -ENOSPC) | ||
1530 | mlog_errno(ret); | ||
1531 | goto out; | ||
1532 | } | ||
1533 | } | ||
1251 | 1534 | ||
1535 | out_dio: | ||
1252 | /* buffered aio wouldn't have proper lock coverage today */ | 1536 | /* buffered aio wouldn't have proper lock coverage today */ |
1253 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1537 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
1254 | 1538 | ||
1255 | /* | 1539 | /* |
1256 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io | 1540 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io |
@@ -1268,14 +1552,25 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
1268 | } | 1552 | } |
1269 | 1553 | ||
1270 | out: | 1554 | out: |
1555 | if (rw_level != -1) | ||
1556 | ocfs2_rw_unlock(inode, rw_level); | ||
1557 | |||
1558 | out_sems: | ||
1271 | if (have_alloc_sem) | 1559 | if (have_alloc_sem) |
1272 | up_read(&inode->i_alloc_sem); | 1560 | up_read(&inode->i_alloc_sem); |
1273 | if (rw_level != -1) | 1561 | |
1274 | ocfs2_rw_unlock(inode, rw_level); | 1562 | if (written > 0 && sync) { |
1563 | ssize_t err; | ||
1564 | |||
1565 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); | ||
1566 | if (err < 0) | ||
1567 | written = err; | ||
1568 | } | ||
1569 | |||
1275 | mutex_unlock(&inode->i_mutex); | 1570 | mutex_unlock(&inode->i_mutex); |
1276 | 1571 | ||
1277 | mlog_exit(ret); | 1572 | mlog_exit(ret); |
1278 | return ret; | 1573 | return written ? written : ret; |
1279 | } | 1574 | } |
1280 | 1575 | ||
1281 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 1576 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, |
@@ -1300,7 +1595,8 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | |||
1300 | goto out; | 1595 | goto out; |
1301 | } | 1596 | } |
1302 | 1597 | ||
1303 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); | 1598 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, |
1599 | NULL); | ||
1304 | if (ret < 0) { | 1600 | if (ret < 0) { |
1305 | mlog_errno(ret); | 1601 | mlog_errno(ret); |
1306 | goto out_unlock; | 1602 | goto out_unlock; |
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e2f6551604d0..2c4460fced52 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h | |||
@@ -46,6 +46,10 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
46 | struct ocfs2_alloc_context *data_ac, | 46 | struct ocfs2_alloc_context *data_ac, |
47 | struct ocfs2_alloc_context *meta_ac, | 47 | struct ocfs2_alloc_context *meta_ac, |
48 | enum ocfs2_alloc_restarted *reason); | 48 | enum ocfs2_alloc_restarted *reason); |
49 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | ||
50 | u32 clusters_to_add, | ||
51 | struct ocfs2_alloc_context **data_ac, | ||
52 | struct ocfs2_alloc_context **meta_ac); | ||
49 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | 53 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); |
50 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, | 54 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, |
51 | struct kstat *stat); | 55 | struct kstat *stat); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index faeb53f2eecf..2699f7cac21a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -463,6 +463,38 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) | |||
463 | return (unsigned long)((bytes + 511) >> 9); | 463 | return (unsigned long)((bytes + 511) >> 9); |
464 | } | 464 | } |
465 | 465 | ||
466 | static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, | ||
467 | unsigned long pg_index) | ||
468 | { | ||
469 | u32 clusters = pg_index; | ||
470 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
471 | |||
472 | if (unlikely(PAGE_CACHE_SHIFT > cbits)) | ||
473 | clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); | ||
474 | else if (PAGE_CACHE_SHIFT < cbits) | ||
475 | clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); | ||
476 | |||
477 | return clusters; | ||
478 | } | ||
479 | |||
480 | /* | ||
481 | * Find the 1st page index which covers the given clusters. | ||
482 | */ | ||
483 | static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb, | ||
484 | u32 clusters) | ||
485 | { | ||
486 | unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; | ||
487 | unsigned long index = clusters; | ||
488 | |||
489 | if (PAGE_CACHE_SHIFT > cbits) { | ||
490 | index = clusters >> (PAGE_CACHE_SHIFT - cbits); | ||
491 | } else if (PAGE_CACHE_SHIFT < cbits) { | ||
492 | index = clusters << (cbits - PAGE_CACHE_SHIFT); | ||
493 | } | ||
494 | |||
495 | return index; | ||
496 | } | ||
497 | |||
466 | #define ocfs2_set_bit ext2_set_bit | 498 | #define ocfs2_set_bit ext2_set_bit |
467 | #define ocfs2_clear_bit ext2_clear_bit | 499 | #define ocfs2_clear_bit ext2_clear_bit |
468 | #define ocfs2_test_bit ext2_test_bit | 500 | #define ocfs2_test_bit ext2_test_bit |