diff options
author | Joel Becker <joel.becker@oracle.com> | 2010-07-01 18:13:31 -0400 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2010-07-08 16:25:35 -0400 |
commit | 5693486bad2bc2ac585a2c24f7e2f3964b478df9 (patch) | |
tree | 03d61d72c1b73bbf0b049bf0328f8e0c69f35a43 /fs/ocfs2/aops.c | |
parent | a4bfb4cf11fd2211b788af59dc8a8b4394bca227 (diff) |
ocfs2: Zero the tail cluster when extending past i_size.
ocfs2's allocation unit is the cluster. This can be larger than a block
or even a memory page. This means that a file may have many blocks in
its last extent that are beyond the block containing i_size. There also
may be more unwritten extents after that.
When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks. Unfortunately,
block_write_full_page() drops the pages past i_size. This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster. This is a bug.
We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size. They will use
ocfs2_zero_extend() to ensure the data is properly zeroed.
Older versions of ocfs2_zero_extend() simply zeroed every block between
i_size and the zeroing position. This presumes three things:
1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.
(1) and (2) hold true for non-sparse filesystems, which used to be the
only users of ocfs2_zero_extend(). (3) is another bug.
Since we're now using ocfs2_zero_extend() for sparse filesystems as
well, we teach ocfs2_zero_extend() to check every extent between
i_size and the zeroing position. If an extent is unwritten, it is
skipped. Otherwise, it is first CoWed if it is refcounted, and then it is zeroed.
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
Diffstat (limited to 'fs/ocfs2/aops.c')
-rw-r--r-- | fs/ocfs2/aops.c | 42 |
1 file changed, 28 insertions, 14 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 9a5c931439bd..742893ea7390 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
196 | dump_stack(); | 196 | dump_stack(); |
197 | goto bail; | 197 | goto bail; |
198 | } | 198 | } |
199 | |||
200 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | ||
201 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, | ||
202 | (unsigned long long)past_eof); | ||
203 | |||
204 | if (create && (iblock >= past_eof)) | ||
205 | set_buffer_new(bh_result); | ||
206 | } | 199 | } |
207 | 200 | ||
201 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | ||
202 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, | ||
203 | (unsigned long long)past_eof); | ||
204 | if (create && (iblock >= past_eof)) | ||
205 | set_buffer_new(bh_result); | ||
206 | |||
208 | bail: | 207 | bail: |
209 | if (err < 0) | 208 | if (err < 0) |
210 | err = -EIO; | 209 | err = -EIO; |
@@ -1590,21 +1589,20 @@ out: | |||
1590 | * write path can treat it as an non-allocating write, which has no | 1589 | * write path can treat it as an non-allocating write, which has no |
1591 | * special case code for sparse/nonsparse files. | 1590 | * special case code for sparse/nonsparse files. |
1592 | */ | 1591 | */ |
1593 | static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, | 1592 | static int ocfs2_expand_nonsparse_inode(struct inode *inode, |
1594 | unsigned len, | 1593 | struct buffer_head *di_bh, |
1594 | loff_t pos, unsigned len, | ||
1595 | struct ocfs2_write_ctxt *wc) | 1595 | struct ocfs2_write_ctxt *wc) |
1596 | { | 1596 | { |
1597 | int ret; | 1597 | int ret; |
1598 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1599 | loff_t newsize = pos + len; | 1598 | loff_t newsize = pos + len; |
1600 | 1599 | ||
1601 | if (ocfs2_sparse_alloc(osb)) | 1600 | BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); |
1602 | return 0; | ||
1603 | 1601 | ||
1604 | if (newsize <= i_size_read(inode)) | 1602 | if (newsize <= i_size_read(inode)) |
1605 | return 0; | 1603 | return 0; |
1606 | 1604 | ||
1607 | ret = ocfs2_extend_no_holes(inode, newsize, pos); | 1605 | ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); |
1608 | if (ret) | 1606 | if (ret) |
1609 | mlog_errno(ret); | 1607 | mlog_errno(ret); |
1610 | 1608 | ||
@@ -1614,6 +1612,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, | |||
1614 | return ret; | 1612 | return ret; |
1615 | } | 1613 | } |
1616 | 1614 | ||
1615 | static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, | ||
1616 | loff_t pos) | ||
1617 | { | ||
1618 | int ret = 0; | ||
1619 | |||
1620 | BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); | ||
1621 | if (pos > i_size_read(inode)) | ||
1622 | ret = ocfs2_zero_extend(inode, di_bh, pos); | ||
1623 | |||
1624 | return ret; | ||
1625 | } | ||
1626 | |||
1617 | int ocfs2_write_begin_nolock(struct address_space *mapping, | 1627 | int ocfs2_write_begin_nolock(struct address_space *mapping, |
1618 | loff_t pos, unsigned len, unsigned flags, | 1628 | loff_t pos, unsigned len, unsigned flags, |
1619 | struct page **pagep, void **fsdata, | 1629 | struct page **pagep, void **fsdata, |
@@ -1649,7 +1659,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1649 | } | 1659 | } |
1650 | } | 1660 | } |
1651 | 1661 | ||
1652 | ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); | 1662 | if (ocfs2_sparse_alloc(osb)) |
1663 | ret = ocfs2_zero_tail(inode, di_bh, pos); | ||
1664 | else | ||
1665 | ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, | ||
1666 | wc); | ||
1653 | if (ret) { | 1667 | if (ret) { |
1654 | mlog_errno(ret); | 1668 | mlog_errno(ret); |
1655 | goto out; | 1669 | goto out; |