aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2015-02-23 06:37:08 -0500
committerDave Chinner <david@fromorbit.com>2015-02-23 06:37:08 -0500
commit5885ebda878b47c4b4602d4b0410cb4b282af024 (patch)
tree5b4f5bf3bdd9666b66218cf03e8780cf644dcd43
parentdfcc70a8c868fe03276fa59864149708fb41930b (diff)
xfs: ensure truncate forces zeroed blocks to disk
A new fsync vs power fail test in xfstests indicated that XFS can have unreliable data consistency when doing extending truncates that require block zeroing. The blocks beyond EOF get zeroed in memory, but we never force those changes to disk before we run the transaction that extends the file size and exposes those blocks to userspace. This can result in the blocks not being correctly zeroed after a crash. Because in-memory behaviour is correct, tools like fsx don't pick up any coherency problems - it's not until the filesystem is shutdown or the system crashes after writing the truncate transaction to the journal but before the zeroed data in the page cache is flushed that the issue is exposed. Fix this by also flushing the dirty data in memory region between the old size and new size when we've found blocks that need zeroing in the truncate process. Reported-by: Liu Bo <bo.li.liu@oracle.com> cc: <stable@vger.kernel.org> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r--fs/xfs/xfs_file.c14
-rw-r--r--fs/xfs/xfs_inode.h9
-rw-r--r--fs/xfs/xfs_iops.c36
3 files changed, 29 insertions, 30 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ce615d12fb44..a2e1cb8a568b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -397,7 +397,8 @@ STATIC int /* error (positive) */
397xfs_zero_last_block( 397xfs_zero_last_block(
398 struct xfs_inode *ip, 398 struct xfs_inode *ip,
399 xfs_fsize_t offset, 399 xfs_fsize_t offset,
400 xfs_fsize_t isize) 400 xfs_fsize_t isize,
401 bool *did_zeroing)
401{ 402{
402 struct xfs_mount *mp = ip->i_mount; 403 struct xfs_mount *mp = ip->i_mount;
403 xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); 404 xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
@@ -425,6 +426,7 @@ xfs_zero_last_block(
425 zero_len = mp->m_sb.sb_blocksize - zero_offset; 426 zero_len = mp->m_sb.sb_blocksize - zero_offset;
426 if (isize + zero_len > offset) 427 if (isize + zero_len > offset)
427 zero_len = offset - isize; 428 zero_len = offset - isize;
429 *did_zeroing = true;
428 return xfs_iozero(ip, isize, zero_len); 430 return xfs_iozero(ip, isize, zero_len);
429} 431}
430 432
@@ -443,7 +445,8 @@ int /* error (positive) */
443xfs_zero_eof( 445xfs_zero_eof(
444 struct xfs_inode *ip, 446 struct xfs_inode *ip,
445 xfs_off_t offset, /* starting I/O offset */ 447 xfs_off_t offset, /* starting I/O offset */
446 xfs_fsize_t isize) /* current inode size */ 448 xfs_fsize_t isize, /* current inode size */
449 bool *did_zeroing)
447{ 450{
448 struct xfs_mount *mp = ip->i_mount; 451 struct xfs_mount *mp = ip->i_mount;
449 xfs_fileoff_t start_zero_fsb; 452 xfs_fileoff_t start_zero_fsb;
@@ -465,7 +468,7 @@ xfs_zero_eof(
465 * We only zero a part of that block so it is handled specially. 468 * We only zero a part of that block so it is handled specially.
466 */ 469 */
467 if (XFS_B_FSB_OFFSET(mp, isize) != 0) { 470 if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
468 error = xfs_zero_last_block(ip, offset, isize); 471 error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
469 if (error) 472 if (error)
470 return error; 473 return error;
471 } 474 }
@@ -525,6 +528,7 @@ xfs_zero_eof(
525 if (error) 528 if (error)
526 return error; 529 return error;
527 530
531 *did_zeroing = true;
528 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 532 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
529 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 533 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
530 } 534 }
@@ -567,13 +571,15 @@ restart:
567 * having to redo all checks before. 571 * having to redo all checks before.
568 */ 572 */
569 if (*pos > i_size_read(inode)) { 573 if (*pos > i_size_read(inode)) {
574 bool zero = false;
575
570 if (*iolock == XFS_IOLOCK_SHARED) { 576 if (*iolock == XFS_IOLOCK_SHARED) {
571 xfs_rw_iunlock(ip, *iolock); 577 xfs_rw_iunlock(ip, *iolock);
572 *iolock = XFS_IOLOCK_EXCL; 578 *iolock = XFS_IOLOCK_EXCL;
573 xfs_rw_ilock(ip, *iolock); 579 xfs_rw_ilock(ip, *iolock);
574 goto restart; 580 goto restart;
575 } 581 }
576 error = xfs_zero_eof(ip, *pos, i_size_read(inode)); 582 error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
577 if (error) 583 if (error)
578 return error; 584 return error;
579 } 585 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 86cd6b39bed7..a1cd55f3f351 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -384,10 +384,11 @@ enum xfs_prealloc_flags {
384 XFS_PREALLOC_INVISIBLE = (1 << 4), 384 XFS_PREALLOC_INVISIBLE = (1 << 4),
385}; 385};
386 386
387int xfs_update_prealloc_flags(struct xfs_inode *, 387int xfs_update_prealloc_flags(struct xfs_inode *ip,
388 enum xfs_prealloc_flags); 388 enum xfs_prealloc_flags flags);
389int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 389int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
390int xfs_iozero(struct xfs_inode *, loff_t, size_t); 390 xfs_fsize_t isize, bool *did_zeroing);
391int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
391 392
392 393
393#define IHOLD(ip) \ 394#define IHOLD(ip) \
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d919ad7b16bf..e53a90331422 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -751,6 +751,7 @@ xfs_setattr_size(
751 int error; 751 int error;
752 uint lock_flags = 0; 752 uint lock_flags = 0;
753 uint commit_flags = 0; 753 uint commit_flags = 0;
754 bool did_zeroing = false;
754 755
755 trace_xfs_setattr(ip); 756 trace_xfs_setattr(ip);
756 757
@@ -794,20 +795,16 @@ xfs_setattr_size(
794 return error; 795 return error;
795 796
796 /* 797 /*
797 * Now we can make the changes. Before we join the inode to the 798 * File data changes must be complete before we start the transaction to
798 * transaction, take care of the part of the truncation that must be 799 * modify the inode. This needs to be done before joining the inode to
799 * done without the inode lock. This needs to be done before joining 800 * the transaction because the inode cannot be unlocked once it is a
800 * the inode to the transaction, because the inode cannot be unlocked 801 * part of the transaction.
801 * once it is a part of the transaction. 802 *
803 * Start with zeroing any data block beyond EOF that we may expose on
804 * file extension.
802 */ 805 */
803 if (newsize > oldsize) { 806 if (newsize > oldsize) {
804 /* 807 error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
805 * Do the first part of growing a file: zero any data in the
806 * last block that is beyond the old EOF. We need to do this
807 * before the inode is joined to the transaction to modify
808 * i_size.
809 */
810 error = xfs_zero_eof(ip, newsize, oldsize);
811 if (error) 808 if (error)
812 return error; 809 return error;
813 } 810 }
@@ -817,23 +814,18 @@ xfs_setattr_size(
817 * any previous writes that are beyond the on disk EOF and the new 814 * any previous writes that are beyond the on disk EOF and the new
818 * EOF that have not been written out need to be written here. If we 815 * EOF that have not been written out need to be written here. If we
819 * do not write the data out, we expose ourselves to the null files 816 * do not write the data out, we expose ourselves to the null files
820 * problem. 817 * problem. Note that this includes any block zeroing we did above;
821 * 818 * otherwise those blocks may not be zeroed after a crash.
822 * Only flush from the on disk size to the smaller of the in memory
823 * file size or the new size as that's the range we really care about
824 * here and prevents waiting for other data not within the range we
825 * care about here.
826 */ 819 */
827 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { 820 if (newsize > ip->i_d.di_size &&
821 (oldsize != ip->i_d.di_size || did_zeroing)) {
828 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 822 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
829 ip->i_d.di_size, newsize); 823 ip->i_d.di_size, newsize);
830 if (error) 824 if (error)
831 return error; 825 return error;
832 } 826 }
833 827
834 /* 828 /* Now wait for all direct I/O to complete. */
835 * Wait for all direct I/O to complete.
836 */
837 inode_dio_wait(inode); 829 inode_dio_wait(inode);
838 830
839 /* 831 /*