author	Dave Chinner <dchinner@redhat.com>	2011-01-10 18:22:40 -0500
committer	Dave Chinner <david@fromorbit.com>	2011-01-10 18:22:40 -0500
commit	eda77982729b7170bdc9e8855f0682edf322d277
tree	09ed733da9142ba979d6440add49f05772da11a4
parent	4d8d15812fd9bc96d0da11467d23e0373feae933
xfs: serialise unaligned direct IOs
When two concurrent unaligned, non-overlapping direct IOs are issued
to the same block, the direct IO layer will race to zero the block.
The result is that one of the concurrent IOs will overwrite data
written by the other IO with zeros. This is demonstrated by xfsqa
test 240.

To avoid this problem, serialise all unaligned direct IOs to an
inode with a big hammer. We need a big hammer approach as we need to
serialise AIO as well, so we can't just block writes on locks.
Hence, the big hammer is calling xfs_ioend_wait() while holding out
other unaligned direct IOs from starting.

We don't bother trying to serialise aligned vs unaligned IOs as they
are overlapping IO and the result of concurrent overlapping IOs is
undefined - the result of either IO is a valid result so we let them
race. Hence we only penalise unaligned IO, which already has a major
overhead compared to aligned IO, so this isn't a major problem.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
 fs/xfs/linux-2.6/xfs_file.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)
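Before the diff itself, here is a minimal userspace sketch of the
alignment test the patch introduces. This is an illustration only:
BLOCK_SIZE is an assumed 4096-byte filesystem block size standing in
for what mp->m_blockmask encodes, and is_unaligned_io() is a
hypothetical helper mirroring the patched condition, not a kernel
function.

/*
 * Illustration only: BLOCK_SIZE is an assumed 4096-byte filesystem
 * block size; mp->m_blockmask plays this role in the real code.
 */
#include <stdio.h>

#define BLOCK_SIZE	4096UL
#define BLOCK_MASK	(BLOCK_SIZE - 1)

/* An IO is unaligned if its start or its end falls mid-block. */
static int is_unaligned_io(unsigned long pos, unsigned long count)
{
	return (pos & BLOCK_MASK) || ((pos + count) & BLOCK_MASK);
}

int main(void)
{
	/*
	 * The racing pair from the commit message: two non-overlapping
	 * 512-byte IOs that land in the same filesystem block.
	 */
	printf("pos 0,   len 512:  %s\n",
		is_unaligned_io(0, 512) ? "unaligned" : "aligned");
	printf("pos 512, len 512:  %s\n",
		is_unaligned_io(512, 512) ? "unaligned" : "aligned");
	/* A full-block IO stays on the IOLOCK_SHARED fast path. */
	printf("pos 0,   len 4096: %s\n",
		is_unaligned_io(0, 4096) ? "unaligned" : "aligned");
	return 0;
}

Both 512-byte IOs test as unaligned, so both would take IOLOCK_EXCL
and be held out of the dio layer's sub-block zeroing race, while the
full-block IO keeps the shared, parallel path.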
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5863dd8f448c..ef51eb43e137 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -684,9 +684,24 @@ xfs_file_aio_write_checks(
  * xfs_file_dio_aio_write - handle direct IO writes
  *
  * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By spearating it from the buffered write path we remove all the tricky to
+ * By separating it from the buffered write path we remove all the tricky to
  * follow locking changes and looping.
  *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer. To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
  */
@@ -706,6 +721,7 @@ xfs_file_dio_aio_write(
 	struct xfs_mount	*mp = ip->i_mount;
 	ssize_t			ret = 0;
 	size_t			count = ocount;
+	int			unaligned_io = 0;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -713,13 +729,10 @@ xfs_file_dio_aio_write(
 	if ((pos & target->bt_smask) || (count & target->bt_smask))
 		return -XFS_ERROR(EINVAL);
 
-	/*
-	 * For direct I/O, if there are cached pages or we're extending
-	 * the file, we need IOLOCK_EXCL until we're sure the bytes at
-	 * the new EOF have been zeroed and/or the cached pages are
-	 * flushed out.
-	 */
-	if (mapping->nrpages || pos > ip->i_size)
+	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+		unaligned_io = 1;
+
+	if (unaligned_io || mapping->nrpages || pos > ip->i_size)
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
@@ -737,8 +750,13 @@ xfs_file_dio_aio_write(
 		return ret;
 	}
 
-	if (*iolock == XFS_IOLOCK_EXCL) {
-		/* demote the lock now the cached pages are gone */
+	/*
+	 * If we are doing unaligned IO, wait for all other IO to drain,
+	 * otherwise demote the lock if we had to flush cached pages
+	 */
+	if (unaligned_io)
+		xfs_ioend_wait(ip);
+	else if (*iolock == XFS_IOLOCK_EXCL) {
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		*iolock = XFS_IOLOCK_SHARED;
 	}
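To make the control flow of the last two hunks easier to follow,
here is a hedged userspace sketch of the submission-path decision.
The xfs_ioend_wait() and lock-demotion calls are represented by
prints, and dio_state with its fields is a hypothetical stand-in for
mapping->nrpages, pos > ip->i_size, and the unaligned_io flag; this
is not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

enum iolock { IOLOCK_SHARED, IOLOCK_EXCL };

/* Hypothetical stand-ins for the state the real function inspects. */
struct dio_state {
	bool unaligned_io;	/* result of the block-alignment test */
	bool cached_pages;	/* mapping->nrpages != 0 */
	bool extending;		/* pos > ip->i_size */
};

/* Initial lock choice, mirroring the patched condition. */
static enum iolock pick_iolock(const struct dio_state *s)
{
	if (s->unaligned_io || s->cached_pages || s->extending)
		return IOLOCK_EXCL;
	return IOLOCK_SHARED;
}

/*
 * After the cached pages are flushed: unaligned IO drains in-flight
 * IO (the xfs_ioend_wait() big hammer) and stays exclusive, while
 * aligned IO demotes the lock so other direct IO can run in parallel.
 */
static enum iolock after_flush(const struct dio_state *s, enum iolock lk)
{
	if (s->unaligned_io) {
		puts("drain in-flight IO (xfs_ioend_wait)");
		return lk;
	}
	if (lk == IOLOCK_EXCL) {
		puts("demote IOLOCK_EXCL -> IOLOCK_SHARED");
		return IOLOCK_SHARED;
	}
	return lk;
}

int main(void)
{
	struct dio_state s = { .unaligned_io = true };
	enum iolock lk = after_flush(&s, pick_iolock(&s));

	printf("submit direct IO holding %s lock\n",
		lk == IOLOCK_EXCL ? "exclusive" : "shared");
	return 0;
}

Note the asymmetry the patch introduces: only the unaligned path
pays for the drain, matching the commit message's point that
unaligned direct IO already carries a major overhead compared to
aligned IO.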