author	Dave Chinner <dchinner@redhat.com>	2011-01-10 18:22:40 -0500
committer	Dave Chinner <david@fromorbit.com>	2011-01-10 18:22:40 -0500
commit	eda77982729b7170bdc9e8855f0682edf322d277
tree	09ed733da9142ba979d6440add49f05772da11a4
parent	4d8d15812fd9bc96d0da11467d23e0373feae933
xfs: serialise unaligned direct IOs
When two concurrent unaligned, non-overlapping direct IOs are issued
to the same block, the direct IO layer will race to zero the block.
The result is that one of the concurrent IOs will overwrite data
written by the other IO with zeros. This is demonstrated by xfsqa
test 240.

To avoid this problem, serialise all unaligned direct IOs to an
inode with a big hammer. We need a big hammer approach as we need to
serialise AIO as well, so we can't just block writes on locks.
Hence, the big hammer is calling xfs_ioend_wait() while holding out
other unaligned direct IOs from starting.

We don't bother trying to serialise aligned vs unaligned IOs as they
are overlapping IO and the result of concurrent overlapping IOs is
undefined - the result of either IO is a valid result so we let them
race. Hence we only penalise unaligned IO, which already has a major
overhead compared to aligned IO, so this isn't a major problem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
 fs/xfs/linux-2.6/xfs_file.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)
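Before the diff itself, here is a minimal userspace sketch of the
alignment test the patch introduces. This is an illustration only:
BLOCK_SIZE is an assumed 4096-byte filesystem block size standing in
for what mp->m_blockmask encodes, and is_unaligned_io() is a
hypothetical helper mirroring the patched condition, not a kernel
function.

/*
 * Illustration only: BLOCK_SIZE is an assumed 4096-byte filesystem
 * block size; mp->m_blockmask plays this role in the real code.
 */
#include <stdio.h>

#define BLOCK_SIZE	4096UL
#define BLOCK_MASK	(BLOCK_SIZE - 1)

/* An IO is unaligned if its start or its end falls mid-block. */
static int is_unaligned_io(unsigned long pos, unsigned long count)
{
	return (pos & BLOCK_MASK) || ((pos + count) & BLOCK_MASK);
}

int main(void)
{
	/*
	 * The racing pair from the commit message: two non-overlapping
	 * 512-byte IOs that land in the same filesystem block.
	 */
	printf("pos 0,   len 512:  %s\n",
		is_unaligned_io(0, 512) ? "unaligned" : "aligned");
	printf("pos 512, len 512:  %s\n",
		is_unaligned_io(512, 512) ? "unaligned" : "aligned");
	/* A full-block IO stays on the IOLOCK_SHARED fast path. */
	printf("pos 0,   len 4096: %s\n",
		is_unaligned_io(0, 4096) ? "unaligned" : "aligned");
	return 0;
}

Both 512-byte IOs test as unaligned, so both would take IOLOCK_EXCL
and be held out of the dio layer's sub-block zeroing race, while the
full-block IO keeps the shared, parallel path.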
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5863dd8f448c..ef51eb43e137 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -684,9 +684,24 @@ xfs_file_aio_write_checks(
  * xfs_file_dio_aio_write - handle direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By spearating it from the buffered write path we remove all the tricky to
+ * By separating it from the buffered write path we remove all the tricky to
  * follow locking changes and looping.
  *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer. To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
  */
@@ -706,6 +721,7 @@ xfs_file_dio_aio_write(
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
 	size_t			count = ocount;
+	int			unaligned_io = 0;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -713,13 +729,10 @@ xfs_file_dio_aio_write(
 	if ((pos & target->bt_smask) || (count & target->bt_smask))
 		return -XFS_ERROR(EINVAL);
 
-	/*
-	 * For direct I/O, if there are cached pages or we're extending
-	 * the file, we need IOLOCK_EXCL until we're sure the bytes at
-	 * the new EOF have been zeroed and/or the cached pages are
-	 * flushed out.
-	 */
-	if (mapping->nrpages || pos > ip->i_size)
+	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+		unaligned_io = 1;
+
+	if (unaligned_io || mapping->nrpages || pos > ip->i_size)
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
@@ -737,8 +750,13 @@ xfs_file_dio_aio_write(
 		return ret;
 	}
 
-	if (*iolock == XFS_IOLOCK_EXCL) {
-		/* demote the lock now the cached pages are gone */
+	/*
+	 * If we are doing unaligned IO, wait for all other IO to drain,
+	 * otherwise demote the lock if we had to flush cached pages
+	 */
+	if (unaligned_io)
+		xfs_ioend_wait(ip);
+	else if (*iolock == XFS_IOLOCK_EXCL) {
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		*iolock = XFS_IOLOCK_SHARED;
 	}
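To make the control flow of the last two hunks easier to follow,
here is a hedged userspace sketch of the submission-path decision.
The xfs_ioend_wait() and lock-demotion calls are represented by
prints, and dio_state with its fields is a hypothetical stand-in for
mapping->nrpages, pos > ip->i_size, and the unaligned_io flag; this
is not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

enum iolock { IOLOCK_SHARED, IOLOCK_EXCL };

/* Hypothetical stand-ins for the state the real function inspects. */
struct dio_state {
	bool unaligned_io;	/* result of the block-alignment test */
	bool cached_pages;	/* mapping->nrpages != 0 */
	bool extending;		/* pos > ip->i_size */
};

/* Initial lock choice, mirroring the patched condition. */
static enum iolock pick_iolock(const struct dio_state *s)
{
	if (s->unaligned_io || s->cached_pages || s->extending)
		return IOLOCK_EXCL;
	return IOLOCK_SHARED;
}

/*
 * After the cached pages are flushed: unaligned IO drains in-flight
 * IO (the xfs_ioend_wait() big hammer) and stays exclusive, while
 * aligned IO demotes the lock so other direct IO can run in parallel.
 */
static enum iolock after_flush(const struct dio_state *s, enum iolock lk)
{
	if (s->unaligned_io) {
		puts("drain in-flight IO (xfs_ioend_wait)");
		return lk;
	}
	if (lk == IOLOCK_EXCL) {
		puts("demote IOLOCK_EXCL -> IOLOCK_SHARED");
		return IOLOCK_SHARED;
	}
	return lk;
}

int main(void)
{
	struct dio_state s = { .unaligned_io = true };
	enum iolock lk = after_flush(&s, pick_iolock(&s));

	printf("submit direct IO holding %s lock\n",
		lk == IOLOCK_EXCL ? "exclusive" : "shared");
	return 0;
}

Note the asymmetry the patch introduces: only the unaligned path
pays for the drain, matching the commit message's point that
unaligned direct IO already carries a major overhead compared to
aligned IO.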