aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2015-04-16 08:03:07 -0400
committerDave Chinner <david@fromorbit.com>2015-04-16 08:03:07 -0400
commitb9d59846f73713d77f0f3fb784c7f84249fc2b93 (patch)
treeba9f8b67c419b208d8c8cd3857b5cea6f27cfca7 /fs/xfs
parenta06c277a13c3620c8ee9304891758f2fcff9c4a4 (diff)
xfs: DIO write completion size updates race
xfs_end_io_direct_write() can race with other IO completions when updating the in-core inode size. The IO completion processing is not serialised for direct IO - they are done either under the IOLOCK_SHARED for non-AIO DIO, and without any IOLOCK held at all during AIO DIO completion. Hence the non-atomic test-and-set update of the in-core inode size is racy and can result in the in-core inode size going backwards if the race if hit just right. If the inode size goes backwards, this can trigger the EOF zeroing code to run incorrectly on the next IO, which then will zero data that has successfully been written to disk by a previous DIO. To fix this bug, we need to serialise the test/set updates of the in-core inode size. This first patch introduces locking around the relevant updates and checks in the DIO path. Because we now have an ioend in xfs_end_io_direct_write(), we know exactly then we are doing an IO that requires an in-core EOF update, and we know that they are not running in interrupt context. As such, we do not need to use irqsave() spinlock variants to protect against interrupts while the lock is held. Hence we can use an existing spinlock in the inode to do this serialisation and so not need to grow the struct xfs_inode just to work around this problem. This patch does not address the test/set EOF update in generic_file_write_direct() for various reasons - that will be done as a followup with separate explanation. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_aops.c7
-rw-r--r--fs/xfs/xfs_file.c13
2 files changed, 19 insertions, 1 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c02a47453137..598b259fda04 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1582,9 +1582,16 @@ xfs_end_io_direct_write(
1582 * with the on-disk inode size being outside the in-core inode size. We 1582 * with the on-disk inode size being outside the in-core inode size. We
1583 * have no other method of updating EOF for AIO, so always do it here 1583 * have no other method of updating EOF for AIO, so always do it here
1584 * if necessary. 1584 * if necessary.
1585 *
1586 * We need to lock the test/set EOF update as we can be racing with
1587 * other IO completions here to update the EOF. Failing to serialise
1588 * here can result in EOF moving backwards and Bad Things Happen when
1589 * that occurs.
1585 */ 1590 */
1591 spin_lock(&ip->i_flags_lock);
1586 if (offset + size > i_size_read(inode)) 1592 if (offset + size > i_size_read(inode))
1587 i_size_write(inode, offset + size); 1593 i_size_write(inode, offset + size);
1594 spin_unlock(&ip->i_flags_lock);
1588 1595
1589 /* 1596 /*
1590 * If we are doing an append IO that needs to update the EOF on disk, 1597 * If we are doing an append IO that needs to update the EOF on disk,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ce615d12fb44..2323b8b63183 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -565,8 +565,18 @@ restart:
565 * write. If zeroing is needed and we are currently holding the 565 * write. If zeroing is needed and we are currently holding the
566 * iolock shared, we need to update it to exclusive which implies 566 * iolock shared, we need to update it to exclusive which implies
567 * having to redo all checks before. 567 * having to redo all checks before.
568 *
569 * We need to serialise against EOF updates that occur in IO
570 * completions here. We want to make sure that nobody is changing the
571 * size while we do this check until we have placed an IO barrier (i.e.
572 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
573 * The spinlock effectively forms a memory barrier once we have the
574 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
575 * and hence be able to correctly determine if we need to run zeroing.
568 */ 576 */
577 spin_lock(&ip->i_flags_lock);
569 if (*pos > i_size_read(inode)) { 578 if (*pos > i_size_read(inode)) {
579 spin_unlock(&ip->i_flags_lock);
570 if (*iolock == XFS_IOLOCK_SHARED) { 580 if (*iolock == XFS_IOLOCK_SHARED) {
571 xfs_rw_iunlock(ip, *iolock); 581 xfs_rw_iunlock(ip, *iolock);
572 *iolock = XFS_IOLOCK_EXCL; 582 *iolock = XFS_IOLOCK_EXCL;
@@ -576,7 +586,8 @@ restart:
576 error = xfs_zero_eof(ip, *pos, i_size_read(inode)); 586 error = xfs_zero_eof(ip, *pos, i_size_read(inode));
577 if (error) 587 if (error)
578 return error; 588 return error;
579 } 589 } else
590 spin_unlock(&ip->i_flags_lock);
580 591
581 /* 592 /*
582 * Updating the timestamps will grab the ilock again from 593 * Updating the timestamps will grab the ilock again from