diff options
author | Joel Becker <joel.becker@oracle.com> | 2010-07-01 18:13:31 -0400 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2010-07-08 16:25:35 -0400 |
commit | 5693486bad2bc2ac585a2c24f7e2f3964b478df9 (patch) | |
tree | 03d61d72c1b73bbf0b049bf0328f8e0c69f35a43 /fs/ocfs2/file.c | |
parent | a4bfb4cf11fd2211b788af59dc8a8b4394bca227 (diff) |
ocfs2: Zero the tail cluster when extending past i_size.
ocfs2's allocation unit is the cluster. This can be larger than a block
or even a memory page. This means that a file may have many blocks in
its last extent that are beyond the block containing i_size. There also
may be more unwritten extents after that.
When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks. Unfortunately,
block_write_full_page() drops the pages past i_size. This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster. This is a bug.
We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size. They will use
ocfs2_zero_extend() to ensure the data is properly zeroed.
Older versions of ocfs2_zero_extend() simply zeroed every block between
i_size and the zeroing position. This presumes three things:
1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.
(1) and (2) hold true for non-sparse filesystems, which used to be the
only users of ocfs2_zero_extend(). (3) is another bug.
Since we're now using ocfs2_zero_extend() for sparse filesystems as
well, we teach ocfs2_zero_extend() to check every extent between
i_size and the zeroing position. If the extent is unwritten, it is
ignored. If it is refcounted, it is CoWed. Then it is zeroed.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r-- | fs/ocfs2/file.c | 201 |
1 files changed, 166 insertions, 35 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4cfc976a9067..ac15911b31c4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -787,6 +787,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, | |||
787 | if (!zero_to) | 787 | if (!zero_to) |
788 | zero_to = PAGE_CACHE_SIZE; | 788 | zero_to = PAGE_CACHE_SIZE; |
789 | 789 | ||
790 | mlog(0, | ||
791 | "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", | ||
792 | (unsigned long long)abs_from, (unsigned long long)abs_to, | ||
793 | index, zero_from, zero_to); | ||
794 | |||
790 | /* We know that zero_from is block aligned */ | 795 | /* We know that zero_from is block aligned */ |
791 | for (block_start = zero_from; block_start < zero_to; | 796 | for (block_start = zero_from; block_start < zero_to; |
792 | block_start = block_end) { | 797 | block_start = block_end) { |
@@ -833,25 +838,114 @@ out: | |||
833 | return ret; | 838 | return ret; |
834 | } | 839 | } |
835 | 840 | ||
836 | static int ocfs2_zero_extend(struct inode *inode, | 841 | /* |
837 | u64 zero_to_size) | 842 | * Find the next range to zero. We do this in terms of bytes because |
843 | * that's what ocfs2_zero_extend() wants, and it is dealing with the | ||
844 | * pagecache. We may return multiple extents. | ||
845 | * | ||
846 | * zero_start and zero_end are ocfs2_zero_extend()s current idea of what | ||
847 | * needs to be zeroed. range_start and range_end return the next zeroing | ||
848 | * range. A subsequent call should pass the previous range_end as its | ||
849 | * zero_start. If range_end is 0, there's nothing to do. | ||
850 | * | ||
851 | * Unwritten extents are skipped over. Refcounted extents are CoWd. | ||
852 | */ | ||
853 | static int ocfs2_zero_extend_get_range(struct inode *inode, | ||
854 | struct buffer_head *di_bh, | ||
855 | u64 zero_start, u64 zero_end, | ||
856 | u64 *range_start, u64 *range_end) | ||
838 | { | 857 | { |
839 | int ret = 0; | 858 | int rc = 0, needs_cow = 0; |
840 | u64 start_off, next_off; | 859 | u32 p_cpos, zero_clusters = 0; |
841 | struct super_block *sb = inode->i_sb; | 860 | u32 zero_cpos = |
861 | zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
862 | u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); | ||
863 | unsigned int num_clusters = 0; | ||
864 | unsigned int ext_flags = 0; | ||
842 | 865 | ||
843 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | 866 | while (zero_cpos < last_cpos) { |
844 | while (start_off < zero_to_size) { | 867 | rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, |
845 | next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; | 868 | &num_clusters, &ext_flags); |
846 | if (next_off > zero_to_size) | 869 | if (rc) { |
847 | next_off = zero_to_size; | 870 | mlog_errno(rc); |
848 | ret = ocfs2_write_zero_page(inode, start_off, next_off); | 871 | goto out; |
849 | if (ret < 0) { | 872 | } |
850 | mlog_errno(ret); | 873 | |
874 | if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { | ||
875 | zero_clusters = num_clusters; | ||
876 | if (ext_flags & OCFS2_EXT_REFCOUNTED) | ||
877 | needs_cow = 1; | ||
878 | break; | ||
879 | } | ||
880 | |||
881 | zero_cpos += num_clusters; | ||
882 | } | ||
883 | if (!zero_clusters) { | ||
884 | *range_end = 0; | ||
885 | goto out; | ||
886 | } | ||
887 | |||
888 | while ((zero_cpos + zero_clusters) < last_cpos) { | ||
889 | rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, | ||
890 | &p_cpos, &num_clusters, | ||
891 | &ext_flags); | ||
892 | if (rc) { | ||
893 | mlog_errno(rc); | ||
894 | goto out; | ||
895 | } | ||
896 | |||
897 | if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
898 | break; | ||
899 | if (ext_flags & OCFS2_EXT_REFCOUNTED) | ||
900 | needs_cow = 1; | ||
901 | zero_clusters += num_clusters; | ||
902 | } | ||
903 | if ((zero_cpos + zero_clusters) > last_cpos) | ||
904 | zero_clusters = last_cpos - zero_cpos; | ||
905 | |||
906 | if (needs_cow) { | ||
907 | rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, | ||
908 | UINT_MAX); | ||
909 | if (rc) { | ||
910 | mlog_errno(rc); | ||
851 | goto out; | 911 | goto out; |
852 | } | 912 | } |
913 | } | ||
853 | 914 | ||
854 | start_off = next_off; | 915 | *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); |
916 | *range_end = ocfs2_clusters_to_bytes(inode->i_sb, | ||
917 | zero_cpos + zero_clusters); | ||
918 | |||
919 | out: | ||
920 | return rc; | ||
921 | } | ||
922 | |||
923 | /* | ||
924 | * Zero one range returned from ocfs2_zero_extend_get_range(). The caller | ||
925 | * has made sure that the entire range needs zeroing. | ||
926 | */ | ||
927 | static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, | ||
928 | u64 range_end) | ||
929 | { | ||
930 | int rc = 0; | ||
931 | u64 next_pos; | ||
932 | u64 zero_pos = range_start; | ||
933 | |||
934 | mlog(0, "range_start = %llu, range_end = %llu\n", | ||
935 | (unsigned long long)range_start, | ||
936 | (unsigned long long)range_end); | ||
937 | BUG_ON(range_start >= range_end); | ||
938 | |||
939 | while (zero_pos < range_end) { | ||
940 | next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; | ||
941 | if (next_pos > range_end) | ||
942 | next_pos = range_end; | ||
943 | rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); | ||
944 | if (rc < 0) { | ||
945 | mlog_errno(rc); | ||
946 | break; | ||
947 | } | ||
948 | zero_pos = next_pos; | ||
855 | 949 | ||
856 | /* | 950 | /* |
857 | * Very large extends have the potential to lock up | 951 | * Very large extends have the potential to lock up |
@@ -860,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode, | |||
860 | cond_resched(); | 954 | cond_resched(); |
861 | } | 955 | } |
862 | 956 | ||
863 | out: | 957 | return rc; |
958 | } | ||
959 | |||
960 | int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, | ||
961 | loff_t zero_to_size) | ||
962 | { | ||
963 | int ret = 0; | ||
964 | u64 zero_start, range_start = 0, range_end = 0; | ||
965 | struct super_block *sb = inode->i_sb; | ||
966 | |||
967 | zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | ||
968 | mlog(0, "zero_start %llu for i_size %llu\n", | ||
969 | (unsigned long long)zero_start, | ||
970 | (unsigned long long)i_size_read(inode)); | ||
971 | while (zero_start < zero_to_size) { | ||
972 | ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, | ||
973 | zero_to_size, | ||
974 | &range_start, | ||
975 | &range_end); | ||
976 | if (ret) { | ||
977 | mlog_errno(ret); | ||
978 | break; | ||
979 | } | ||
980 | if (!range_end) | ||
981 | break; | ||
982 | /* Trim the ends */ | ||
983 | if (range_start < zero_start) | ||
984 | range_start = zero_start; | ||
985 | if (range_end > zero_to_size) | ||
986 | range_end = zero_to_size; | ||
987 | |||
988 | ret = ocfs2_zero_extend_range(inode, range_start, | ||
989 | range_end); | ||
990 | if (ret) { | ||
991 | mlog_errno(ret); | ||
992 | break; | ||
993 | } | ||
994 | zero_start = range_end; | ||
995 | } | ||
996 | |||
864 | return ret; | 997 | return ret; |
865 | } | 998 | } |
866 | 999 | ||
867 | int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) | 1000 | int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, |
1001 | u64 new_i_size, u64 zero_to) | ||
868 | { | 1002 | { |
869 | int ret; | 1003 | int ret; |
870 | u32 clusters_to_add; | 1004 | u32 clusters_to_add; |
871 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 1005 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
872 | 1006 | ||
1007 | /* | ||
1008 | * Only quota files call this without a bh, and they can't be | ||
1009 | * refcounted. | ||
1010 | */ | ||
1011 | BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); | ||
1012 | BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); | ||
1013 | |||
873 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); | 1014 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); |
874 | if (clusters_to_add < oi->ip_clusters) | 1015 | if (clusters_to_add < oi->ip_clusters) |
875 | clusters_to_add = 0; | 1016 | clusters_to_add = 0; |
@@ -890,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) | |||
890 | * still need to zero the area between the old i_size and the | 1031 | * still need to zero the area between the old i_size and the |
891 | * new i_size. | 1032 | * new i_size. |
892 | */ | 1033 | */ |
893 | ret = ocfs2_zero_extend(inode, zero_to); | 1034 | ret = ocfs2_zero_extend(inode, di_bh, zero_to); |
894 | if (ret < 0) | 1035 | if (ret < 0) |
895 | mlog_errno(ret); | 1036 | mlog_errno(ret); |
896 | 1037 | ||
@@ -912,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode, | |||
912 | goto out; | 1053 | goto out; |
913 | 1054 | ||
914 | if (i_size_read(inode) == new_i_size) | 1055 | if (i_size_read(inode) == new_i_size) |
915 | goto out; | 1056 | goto out; |
916 | BUG_ON(new_i_size < i_size_read(inode)); | 1057 | BUG_ON(new_i_size < i_size_read(inode)); |
917 | 1058 | ||
918 | /* | 1059 | /* |
919 | * Fall through for converting inline data, even if the fs | ||
920 | * supports sparse files. | ||
921 | * | ||
922 | * The check for inline data here is legal - nobody can add | ||
923 | * the feature since we have i_mutex. We must check it again | ||
924 | * after acquiring ip_alloc_sem though, as paths like mmap | ||
925 | * might have raced us to converting the inode to extents. | ||
926 | */ | ||
927 | if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) | ||
928 | && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | ||
929 | goto out_update_size; | ||
930 | |||
931 | /* | ||
932 | * The alloc sem blocks people in read/write from reading our | 1060 | * The alloc sem blocks people in read/write from reading our |
933 | * allocation until we're done changing it. We depend on | 1061 | * allocation until we're done changing it. We depend on |
934 | * i_mutex to block other extend/truncate calls while we're | 1062 | * i_mutex to block other extend/truncate calls while we're |
935 | * here. | 1063 | * here. We even have to hold it for sparse files because there |
1064 | * might be some tail zeroing. | ||
936 | */ | 1065 | */ |
937 | down_write(&oi->ip_alloc_sem); | 1066 | down_write(&oi->ip_alloc_sem); |
938 | 1067 | ||
@@ -949,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode, | |||
949 | ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); | 1078 | ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); |
950 | if (ret) { | 1079 | if (ret) { |
951 | up_write(&oi->ip_alloc_sem); | 1080 | up_write(&oi->ip_alloc_sem); |
952 | |||
953 | mlog_errno(ret); | 1081 | mlog_errno(ret); |
954 | goto out; | 1082 | goto out; |
955 | } | 1083 | } |
956 | } | 1084 | } |
957 | 1085 | ||
958 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | 1086 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) |
959 | ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); | 1087 | ret = ocfs2_zero_extend(inode, di_bh, new_i_size); |
1088 | else | ||
1089 | ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, | ||
1090 | new_i_size); | ||
960 | 1091 | ||
961 | up_write(&oi->ip_alloc_sem); | 1092 | up_write(&oi->ip_alloc_sem); |
962 | 1093 | ||