aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/file.c
diff options
context:
space:
mode:
author: Joel Becker <joel.becker@oracle.com> 2010-07-01 18:13:31 -0400
committer: Joel Becker <joel.becker@oracle.com> 2010-07-08 16:25:35 -0400
commit: 5693486bad2bc2ac585a2c24f7e2f3964b478df9 (patch)
tree: 03d61d72c1b73bbf0b049bf0328f8e0c69f35a43 /fs/ocfs2/file.c
parent: a4bfb4cf11fd2211b788af59dc8a8b4394bca227 (diff)
ocfs2: Zero the tail cluster when extending past i_size.
ocfs2's allocation unit is the cluster. This can be larger than a block or even a memory page. This means that a file may have many blocks in its last extent that are beyond the block containing i_size. There also may be more unwritten extents after that.

When ocfs2 grows a file, it zeros the entire cluster in order to ensure future i_size growth will see cleared blocks. Unfortunately, block_write_full_page() drops the pages past i_size. This means that ocfs2 is actually leaking garbage data into the tail end of that last cluster. This is a bug.

We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect when a write or truncate is past i_size. They will use ocfs2_zero_extend() to ensure the data is properly zeroed.

Older versions of ocfs2_zero_extend() simply zeroed every block between i_size and the zeroing position. This presumes three things:

1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.

(1) and (2) hold true for non-sparse filesystems, which used to be the only users of ocfs2_zero_extend(). (3) is another bug.

Since we're now using ocfs2_zero_extend() for sparse filesystems as well, we teach ocfs2_zero_extend() to check every extent between i_size and the zeroing position. If the extent is unwritten, it is ignored. If it is refcounted, it is CoWed. Then it is zeroed.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r-- fs/ocfs2/file.c 201
1 file changed, 166 insertions, 35 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4cfc976a9067..ac15911b31c4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -787,6 +787,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
787 if (!zero_to) 787 if (!zero_to)
788 zero_to = PAGE_CACHE_SIZE; 788 zero_to = PAGE_CACHE_SIZE;
789 789
790 mlog(0,
791 "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
792 (unsigned long long)abs_from, (unsigned long long)abs_to,
793 index, zero_from, zero_to);
794
790 /* We know that zero_from is block aligned */ 795 /* We know that zero_from is block aligned */
791 for (block_start = zero_from; block_start < zero_to; 796 for (block_start = zero_from; block_start < zero_to;
792 block_start = block_end) { 797 block_start = block_end) {
@@ -833,25 +838,114 @@ out:
833 return ret; 838 return ret;
834} 839}
835 840
836static int ocfs2_zero_extend(struct inode *inode, 841/*
837 u64 zero_to_size) 842 * Find the next range to zero. We do this in terms of bytes because
843 * that's what ocfs2_zero_extend() wants, and it is dealing with the
844 * pagecache. We may return multiple extents.
845 *
846 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
847 * needs to be zeroed. range_start and range_end return the next zeroing
848 * range. A subsequent call should pass the previous range_end as its
849 * zero_start. If range_end is 0, there's nothing to do.
850 *
851 * Unwritten extents are skipped over. Refcounted extents are CoWd.
852 */
853static int ocfs2_zero_extend_get_range(struct inode *inode,
854 struct buffer_head *di_bh,
855 u64 zero_start, u64 zero_end,
856 u64 *range_start, u64 *range_end)
838{ 857{
839 int ret = 0; 858 int rc = 0, needs_cow = 0;
840 u64 start_off, next_off; 859 u32 p_cpos, zero_clusters = 0;
841 struct super_block *sb = inode->i_sb; 860 u32 zero_cpos =
861 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
862 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
863 unsigned int num_clusters = 0;
864 unsigned int ext_flags = 0;
842 865
843 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 866 while (zero_cpos < last_cpos) {
844 while (start_off < zero_to_size) { 867 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
845 next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; 868 &num_clusters, &ext_flags);
846 if (next_off > zero_to_size) 869 if (rc) {
847 next_off = zero_to_size; 870 mlog_errno(rc);
848 ret = ocfs2_write_zero_page(inode, start_off, next_off); 871 goto out;
849 if (ret < 0) { 872 }
850 mlog_errno(ret); 873
874 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
875 zero_clusters = num_clusters;
876 if (ext_flags & OCFS2_EXT_REFCOUNTED)
877 needs_cow = 1;
878 break;
879 }
880
881 zero_cpos += num_clusters;
882 }
883 if (!zero_clusters) {
884 *range_end = 0;
885 goto out;
886 }
887
888 while ((zero_cpos + zero_clusters) < last_cpos) {
889 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
890 &p_cpos, &num_clusters,
891 &ext_flags);
892 if (rc) {
893 mlog_errno(rc);
894 goto out;
895 }
896
897 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
898 break;
899 if (ext_flags & OCFS2_EXT_REFCOUNTED)
900 needs_cow = 1;
901 zero_clusters += num_clusters;
902 }
903 if ((zero_cpos + zero_clusters) > last_cpos)
904 zero_clusters = last_cpos - zero_cpos;
905
906 if (needs_cow) {
907 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
908 UINT_MAX);
909 if (rc) {
910 mlog_errno(rc);
851 goto out; 911 goto out;
852 } 912 }
913 }
853 914
854 start_off = next_off; 915 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
916 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
917 zero_cpos + zero_clusters);
918
919out:
920 return rc;
921}
922
923/*
924 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
925 * has made sure that the entire range needs zeroing.
926 */
927static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
928 u64 range_end)
929{
930 int rc = 0;
931 u64 next_pos;
932 u64 zero_pos = range_start;
933
934 mlog(0, "range_start = %llu, range_end = %llu\n",
935 (unsigned long long)range_start,
936 (unsigned long long)range_end);
937 BUG_ON(range_start >= range_end);
938
939 while (zero_pos < range_end) {
940 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
941 if (next_pos > range_end)
942 next_pos = range_end;
943 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
944 if (rc < 0) {
945 mlog_errno(rc);
946 break;
947 }
948 zero_pos = next_pos;
855 949
856 /* 950 /*
857 * Very large extends have the potential to lock up 951 * Very large extends have the potential to lock up
@@ -860,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
860 cond_resched(); 954 cond_resched();
861 } 955 }
862 956
863out: 957 return rc;
958}
959
960int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
961 loff_t zero_to_size)
962{
963 int ret = 0;
964 u64 zero_start, range_start = 0, range_end = 0;
965 struct super_block *sb = inode->i_sb;
966
967 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
968 mlog(0, "zero_start %llu for i_size %llu\n",
969 (unsigned long long)zero_start,
970 (unsigned long long)i_size_read(inode));
971 while (zero_start < zero_to_size) {
972 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
973 zero_to_size,
974 &range_start,
975 &range_end);
976 if (ret) {
977 mlog_errno(ret);
978 break;
979 }
980 if (!range_end)
981 break;
982 /* Trim the ends */
983 if (range_start < zero_start)
984 range_start = zero_start;
985 if (range_end > zero_to_size)
986 range_end = zero_to_size;
987
988 ret = ocfs2_zero_extend_range(inode, range_start,
989 range_end);
990 if (ret) {
991 mlog_errno(ret);
992 break;
993 }
994 zero_start = range_end;
995 }
996
864 return ret; 997 return ret;
865} 998}
866 999
867int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 1000int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1001 u64 new_i_size, u64 zero_to)
868{ 1002{
869 int ret; 1003 int ret;
870 u32 clusters_to_add; 1004 u32 clusters_to_add;
871 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1005 struct ocfs2_inode_info *oi = OCFS2_I(inode);
872 1006
1007 /*
1008 * Only quota files call this without a bh, and they can't be
1009 * refcounted.
1010 */
1011 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1012 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1013
873 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 1014 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
874 if (clusters_to_add < oi->ip_clusters) 1015 if (clusters_to_add < oi->ip_clusters)
875 clusters_to_add = 0; 1016 clusters_to_add = 0;
@@ -890,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
890 * still need to zero the area between the old i_size and the 1031 * still need to zero the area between the old i_size and the
891 * new i_size. 1032 * new i_size.
892 */ 1033 */
893 ret = ocfs2_zero_extend(inode, zero_to); 1034 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
894 if (ret < 0) 1035 if (ret < 0)
895 mlog_errno(ret); 1036 mlog_errno(ret);
896 1037
@@ -912,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
912 goto out; 1053 goto out;
913 1054
914 if (i_size_read(inode) == new_i_size) 1055 if (i_size_read(inode) == new_i_size)
915 goto out; 1056 goto out;
916 BUG_ON(new_i_size < i_size_read(inode)); 1057 BUG_ON(new_i_size < i_size_read(inode));
917 1058
918 /* 1059 /*
919 * Fall through for converting inline data, even if the fs
920 * supports sparse files.
921 *
922 * The check for inline data here is legal - nobody can add
923 * the feature since we have i_mutex. We must check it again
924 * after acquiring ip_alloc_sem though, as paths like mmap
925 * might have raced us to converting the inode to extents.
926 */
927 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
928 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
929 goto out_update_size;
930
931 /*
932 * The alloc sem blocks people in read/write from reading our 1060 * The alloc sem blocks people in read/write from reading our
933 * allocation until we're done changing it. We depend on 1061 * allocation until we're done changing it. We depend on
934 * i_mutex to block other extend/truncate calls while we're 1062 * i_mutex to block other extend/truncate calls while we're
935 * here. 1063 * here. We even have to hold it for sparse files because there
1064 * might be some tail zeroing.
936 */ 1065 */
937 down_write(&oi->ip_alloc_sem); 1066 down_write(&oi->ip_alloc_sem);
938 1067
@@ -949,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
949 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1078 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
950 if (ret) { 1079 if (ret) {
951 up_write(&oi->ip_alloc_sem); 1080 up_write(&oi->ip_alloc_sem);
952
953 mlog_errno(ret); 1081 mlog_errno(ret);
954 goto out; 1082 goto out;
955 } 1083 }
956 } 1084 }
957 1085
958 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1086 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
959 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); 1087 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1088 else
1089 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1090 new_i_size);
960 1091
961 up_write(&oi->ip_alloc_sem); 1092 up_write(&oi->ip_alloc_sem);
962 1093