about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Joel Becker <joel.becker@oracle.com> 2010-07-01 18:13:31 -0400
committer Joel Becker <joel.becker@oracle.com> 2010-07-08 16:25:35 -0400
commit 5693486bad2bc2ac585a2c24f7e2f3964b478df9 (patch)
tree 03d61d72c1b73bbf0b049bf0328f8e0c69f35a43
parent a4bfb4cf11fd2211b788af59dc8a8b4394bca227 (diff)
ocfs2: Zero the tail cluster when extending past i_size.
ocfs2's allocation unit is the cluster. This can be larger than a block or even a memory page. This means that a file may have many blocks in its last extent that are beyond the block containing i_size. There also may be more unwritten extents after that.

When ocfs2 grows a file, it zeros the entire cluster in order to ensure future i_size growth will see cleared blocks. Unfortunately, block_write_full_page() drops the pages past i_size. This means that ocfs2 is actually leaking garbage data into the tail end of that last cluster. This is a bug.

We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect when a write or truncate is past i_size. They will use ocfs2_zero_extend() to ensure the data is properly zeroed.

Older versions of ocfs2_zero_extend() simply zeroed every block between i_size and the zeroing position. This presumes three things:

1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.

(1) and (2) hold true for non-sparse filesystems, which used to be the only users of ocfs2_zero_extend(). (3) is another bug.

Since we're now using ocfs2_zero_extend() for sparse filesystems as well, we teach ocfs2_zero_extend() to check every extent between i_size and the zeroing position. If the extent is unwritten, it is ignored. If it is refcounted, it is CoWed. Then it is zeroed.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
-rw-r--r-- fs/ocfs2/aops.c 42
-rw-r--r-- fs/ocfs2/file.c 201
-rw-r--r-- fs/ocfs2/file.h 6
-rw-r--r-- fs/ocfs2/quota_global.c 2
-rw-r--r-- fs/ocfs2/quota_local.c 4
-rw-r--r-- fs/ocfs2/refcounttree.c 6
6 files changed, 207 insertions, 54 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9a5c931439bd..742893ea7390 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
196 dump_stack(); 196 dump_stack();
197 goto bail; 197 goto bail;
198 } 198 }
199
200 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
201 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
202 (unsigned long long)past_eof);
203
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206 } 199 }
207 200
201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
202 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
203 (unsigned long long)past_eof);
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206
208bail: 207bail:
209 if (err < 0) 208 if (err < 0)
210 err = -EIO; 209 err = -EIO;
@@ -1590,21 +1589,20 @@ out:
1590 * write path can treat it as an non-allocating write, which has no 1589 * write path can treat it as an non-allocating write, which has no
1591 * special case code for sparse/nonsparse files. 1590 * special case code for sparse/nonsparse files.
1592 */ 1591 */
1593static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, 1592static int ocfs2_expand_nonsparse_inode(struct inode *inode,
1594 unsigned len, 1593 struct buffer_head *di_bh,
1594 loff_t pos, unsigned len,
1595 struct ocfs2_write_ctxt *wc) 1595 struct ocfs2_write_ctxt *wc)
1596{ 1596{
1597 int ret; 1597 int ret;
1598 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1599 loff_t newsize = pos + len; 1598 loff_t newsize = pos + len;
1600 1599
1601 if (ocfs2_sparse_alloc(osb)) 1600 BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1602 return 0;
1603 1601
1604 if (newsize <= i_size_read(inode)) 1602 if (newsize <= i_size_read(inode))
1605 return 0; 1603 return 0;
1606 1604
1607 ret = ocfs2_extend_no_holes(inode, newsize, pos); 1605 ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
1608 if (ret) 1606 if (ret)
1609 mlog_errno(ret); 1607 mlog_errno(ret);
1610 1608
@@ -1614,6 +1612,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1614 return ret; 1612 return ret;
1615} 1613}
1616 1614
1615static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1616 loff_t pos)
1617{
1618 int ret = 0;
1619
1620 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1621 if (pos > i_size_read(inode))
1622 ret = ocfs2_zero_extend(inode, di_bh, pos);
1623
1624 return ret;
1625}
1626
1617int ocfs2_write_begin_nolock(struct address_space *mapping, 1627int ocfs2_write_begin_nolock(struct address_space *mapping,
1618 loff_t pos, unsigned len, unsigned flags, 1628 loff_t pos, unsigned len, unsigned flags,
1619 struct page **pagep, void **fsdata, 1629 struct page **pagep, void **fsdata,
@@ -1649,7 +1659,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1649 } 1659 }
1650 } 1660 }
1651 1661
1652 ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); 1662 if (ocfs2_sparse_alloc(osb))
1663 ret = ocfs2_zero_tail(inode, di_bh, pos);
1664 else
1665 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
1666 wc);
1653 if (ret) { 1667 if (ret) {
1654 mlog_errno(ret); 1668 mlog_errno(ret);
1655 goto out; 1669 goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4cfc976a9067..ac15911b31c4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -787,6 +787,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
787 if (!zero_to) 787 if (!zero_to)
788 zero_to = PAGE_CACHE_SIZE; 788 zero_to = PAGE_CACHE_SIZE;
789 789
790 mlog(0,
791 "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
792 (unsigned long long)abs_from, (unsigned long long)abs_to,
793 index, zero_from, zero_to);
794
790 /* We know that zero_from is block aligned */ 795 /* We know that zero_from is block aligned */
791 for (block_start = zero_from; block_start < zero_to; 796 for (block_start = zero_from; block_start < zero_to;
792 block_start = block_end) { 797 block_start = block_end) {
@@ -833,25 +838,114 @@ out:
833 return ret; 838 return ret;
834} 839}
835 840
836static int ocfs2_zero_extend(struct inode *inode, 841/*
837 u64 zero_to_size) 842 * Find the next range to zero. We do this in terms of bytes because
843 * that's what ocfs2_zero_extend() wants, and it is dealing with the
844 * pagecache. We may return multiple extents.
845 *
846 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
847 * needs to be zeroed. range_start and range_end return the next zeroing
848 * range. A subsequent call should pass the previous range_end as its
849 * zero_start. If range_end is 0, there's nothing to do.
850 *
851 * Unwritten extents are skipped over. Refcounted extents are CoWd.
852 */
853static int ocfs2_zero_extend_get_range(struct inode *inode,
854 struct buffer_head *di_bh,
855 u64 zero_start, u64 zero_end,
856 u64 *range_start, u64 *range_end)
838{ 857{
839 int ret = 0; 858 int rc = 0, needs_cow = 0;
840 u64 start_off, next_off; 859 u32 p_cpos, zero_clusters = 0;
841 struct super_block *sb = inode->i_sb; 860 u32 zero_cpos =
861 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
862 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
863 unsigned int num_clusters = 0;
864 unsigned int ext_flags = 0;
842 865
843 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 866 while (zero_cpos < last_cpos) {
844 while (start_off < zero_to_size) { 867 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
845 next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; 868 &num_clusters, &ext_flags);
846 if (next_off > zero_to_size) 869 if (rc) {
847 next_off = zero_to_size; 870 mlog_errno(rc);
848 ret = ocfs2_write_zero_page(inode, start_off, next_off); 871 goto out;
849 if (ret < 0) { 872 }
850 mlog_errno(ret); 873
874 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
875 zero_clusters = num_clusters;
876 if (ext_flags & OCFS2_EXT_REFCOUNTED)
877 needs_cow = 1;
878 break;
879 }
880
881 zero_cpos += num_clusters;
882 }
883 if (!zero_clusters) {
884 *range_end = 0;
885 goto out;
886 }
887
888 while ((zero_cpos + zero_clusters) < last_cpos) {
889 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
890 &p_cpos, &num_clusters,
891 &ext_flags);
892 if (rc) {
893 mlog_errno(rc);
894 goto out;
895 }
896
897 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
898 break;
899 if (ext_flags & OCFS2_EXT_REFCOUNTED)
900 needs_cow = 1;
901 zero_clusters += num_clusters;
902 }
903 if ((zero_cpos + zero_clusters) > last_cpos)
904 zero_clusters = last_cpos - zero_cpos;
905
906 if (needs_cow) {
907 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
908 UINT_MAX);
909 if (rc) {
910 mlog_errno(rc);
851 goto out; 911 goto out;
852 } 912 }
913 }
853 914
854 start_off = next_off; 915 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
916 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
917 zero_cpos + zero_clusters);
918
919out:
920 return rc;
921}
922
923/*
924 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
925 * has made sure that the entire range needs zeroing.
926 */
927static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
928 u64 range_end)
929{
930 int rc = 0;
931 u64 next_pos;
932 u64 zero_pos = range_start;
933
934 mlog(0, "range_start = %llu, range_end = %llu\n",
935 (unsigned long long)range_start,
936 (unsigned long long)range_end);
937 BUG_ON(range_start >= range_end);
938
939 while (zero_pos < range_end) {
940 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
941 if (next_pos > range_end)
942 next_pos = range_end;
943 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
944 if (rc < 0) {
945 mlog_errno(rc);
946 break;
947 }
948 zero_pos = next_pos;
855 949
856 /* 950 /*
857 * Very large extends have the potential to lock up 951 * Very large extends have the potential to lock up
@@ -860,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
860 cond_resched(); 954 cond_resched();
861 } 955 }
862 956
863out: 957 return rc;
958}
959
960int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
961 loff_t zero_to_size)
962{
963 int ret = 0;
964 u64 zero_start, range_start = 0, range_end = 0;
965 struct super_block *sb = inode->i_sb;
966
967 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
968 mlog(0, "zero_start %llu for i_size %llu\n",
969 (unsigned long long)zero_start,
970 (unsigned long long)i_size_read(inode));
971 while (zero_start < zero_to_size) {
972 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
973 zero_to_size,
974 &range_start,
975 &range_end);
976 if (ret) {
977 mlog_errno(ret);
978 break;
979 }
980 if (!range_end)
981 break;
982 /* Trim the ends */
983 if (range_start < zero_start)
984 range_start = zero_start;
985 if (range_end > zero_to_size)
986 range_end = zero_to_size;
987
988 ret = ocfs2_zero_extend_range(inode, range_start,
989 range_end);
990 if (ret) {
991 mlog_errno(ret);
992 break;
993 }
994 zero_start = range_end;
995 }
996
864 return ret; 997 return ret;
865} 998}
866 999
867int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 1000int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1001 u64 new_i_size, u64 zero_to)
868{ 1002{
869 int ret; 1003 int ret;
870 u32 clusters_to_add; 1004 u32 clusters_to_add;
871 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1005 struct ocfs2_inode_info *oi = OCFS2_I(inode);
872 1006
1007 /*
1008 * Only quota files call this without a bh, and they can't be
1009 * refcounted.
1010 */
1011 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1012 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1013
873 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 1014 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
874 if (clusters_to_add < oi->ip_clusters) 1015 if (clusters_to_add < oi->ip_clusters)
875 clusters_to_add = 0; 1016 clusters_to_add = 0;
@@ -890,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
890 * still need to zero the area between the old i_size and the 1031 * still need to zero the area between the old i_size and the
891 * new i_size. 1032 * new i_size.
892 */ 1033 */
893 ret = ocfs2_zero_extend(inode, zero_to); 1034 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
894 if (ret < 0) 1035 if (ret < 0)
895 mlog_errno(ret); 1036 mlog_errno(ret);
896 1037
@@ -912,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
912 goto out; 1053 goto out;
913 1054
914 if (i_size_read(inode) == new_i_size) 1055 if (i_size_read(inode) == new_i_size)
915 goto out; 1056 goto out;
916 BUG_ON(new_i_size < i_size_read(inode)); 1057 BUG_ON(new_i_size < i_size_read(inode));
917 1058
918 /* 1059 /*
919 * Fall through for converting inline data, even if the fs
920 * supports sparse files.
921 *
922 * The check for inline data here is legal - nobody can add
923 * the feature since we have i_mutex. We must check it again
924 * after acquiring ip_alloc_sem though, as paths like mmap
925 * might have raced us to converting the inode to extents.
926 */
927 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
928 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
929 goto out_update_size;
930
931 /*
932 * The alloc sem blocks people in read/write from reading our 1060 * The alloc sem blocks people in read/write from reading our
933 * allocation until we're done changing it. We depend on 1061 * allocation until we're done changing it. We depend on
934 * i_mutex to block other extend/truncate calls while we're 1062 * i_mutex to block other extend/truncate calls while we're
935 * here. 1063 * here. We even have to hold it for sparse files because there
1064 * might be some tail zeroing.
936 */ 1065 */
937 down_write(&oi->ip_alloc_sem); 1066 down_write(&oi->ip_alloc_sem);
938 1067
@@ -949,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
949 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1078 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
950 if (ret) { 1079 if (ret) {
951 up_write(&oi->ip_alloc_sem); 1080 up_write(&oi->ip_alloc_sem);
952
953 mlog_errno(ret); 1081 mlog_errno(ret);
954 goto out; 1082 goto out;
955 } 1083 }
956 } 1084 }
957 1085
958 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1086 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
959 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); 1087 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1088 else
1089 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1090 new_i_size);
960 1091
961 up_write(&oi->ip_alloc_sem); 1092 up_write(&oi->ip_alloc_sem);
962 1093
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
54int ocfs2_simple_size_update(struct inode *inode, 54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh, 55 struct buffer_head *di_bh,
56 u64 new_i_size); 56 u64 new_i_size);
57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
58 u64 zero_to); 58 u64 new_i_size, u64 zero_to);
59int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
60 loff_t zero_to);
59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
60int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
61 struct kstat *stat); 63 struct kstat *stat);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe00511..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
775 * locking allocators ranks above a transaction start 775 * locking allocators ranks above a transaction start
776 */ 776 */
777 WARN_ON(journal_current_handle()); 777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode, 778 status = ocfs2_extend_no_holes(gqinode, NULL,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits), 779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size); 780 gqinode->i_size);
781 if (status < 0) 781 if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184d..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
971 u64 p_blkno; 971 u64 p_blkno;
972 972
973 /* We are protected by dqio_sem so no locking needed */ 973 /* We are protected by dqio_sem so no locking needed */
974 status = ocfs2_extend_no_holes(lqinode, 974 status = ocfs2_extend_no_holes(lqinode, NULL,
975 lqinode->i_size + 2 * sb->s_blocksize, 975 lqinode->i_size + 2 * sb->s_blocksize,
976 lqinode->i_size); 976 lqinode->i_size);
977 if (status < 0) { 977 if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1114 return ocfs2_local_quota_add_chunk(sb, type, offset); 1114 return ocfs2_local_quota_add_chunk(sb, type, offset);
1115 1115
1116 /* We are protected by dqio_sem so no locking needed */ 1116 /* We are protected by dqio_sem so no locking needed */
1117 status = ocfs2_extend_no_holes(lqinode, 1117 status = ocfs2_extend_no_holes(lqinode, NULL,
1118 lqinode->i_size + sb->s_blocksize, 1118 lqinode->i_size + sb->s_blocksize,
1119 lqinode->i_size); 1119 lqinode->i_size);
1120 if (status < 0) { 1120 if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f6518..32949df10694 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4166,6 +4166,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4166 struct inode *inode = old_dentry->d_inode; 4166 struct inode *inode = old_dentry->d_inode;
4167 struct buffer_head *new_bh = NULL; 4167 struct buffer_head *new_bh = NULL;
4168 4168
4169 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4170 ret = -EINVAL;
4171 mlog_errno(ret);
4172 goto out;
4173 }
4174
4169 ret = filemap_fdatawrite(inode->i_mapping); 4175 ret = filemap_fdatawrite(inode->i_mapping);
4170 if (ret) { 4176 if (ret) {
4171 mlog_errno(ret); 4177 mlog_errno(ret);