ocfs2: serialize unaligned aio

Fix a corruption that can happen when we have two or more outstanding aios to an overlapping unaligned region. Ext4 (e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix similar issues. In our case what happens is that we can have an outstanding aio on a region, and if a write comes in with some bytes overlapping the original aio we may decide to read that region into a page before continuing (typically because of buffered-io fallback). Since we have no ordering guarantees with the aio, we can read stale or bad data into the page and then write it back out. If the i/o is page- and block-aligned, then we avoid this issue as there won't be any need to read data from disk. I took the same approach as Eric in the ext4 patch and introduced some serialization of unaligned async direct i/o. I don't expect this to have an effect on the most common cases of AIO. Unaligned aio will be slower, but that's far more acceptable than data corruption. Signed-off-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <jlbec@evilplan.org>
author: Mark Fasheh <mfasheh@suse.com> 2011-06-22 17:23:38 -0400
committer: Joel Becker <jlbec@evilplan.org> 2011-07-28 05:07:16 -0400
commit: a11f7e63c59810a81494d4c4b028af707d4c7ca4 (patch)
tree: 6d28cfc9519f96db5c20780bf765de9e0fc03bef /fs/ocfs2/file.c
parent: 730e663bd82c1a10a85ff00728d34152a5a67ec8 (diff)
1 files changed, 38 insertions, 0 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b1e35a392ca5..145f4533a936 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2038,6 +2038,23 @@ out:
        return ret;
 }
+/*
+ * Sleep on the wait queue that ocfs2_ioend_wq() returns for @inode until
+ * the inode's ip_unaligned_aio counter reads zero.
+ */
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+        wait_queue_head_t *ioend_wq = ocfs2_ioend_wq(inode);
+
+        wait_event(*ioend_wq, !atomic_read(&OCFS2_I(inode)->ip_unaligned_aio));
+}
+/*
+ * Report whether a write of @count bytes at offset @pos begins or ends
+ * off a block boundary of @inode's superblock.
+ *
+ * Returns 1 if either @pos or the end offset (@pos + @count) is nonzero
+ * when masked with (s_blocksize - 1), and 0 otherwise.
+ */
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+        int mask = inode->i_sb->s_blocksize - 1;
+        loff_t end = pos + count;
+
+        /* A low bit set in either endpoint means a partial block. */
+        return ((pos | end) & mask) ? 1 : 0;
+}
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
                                            struct file *file,
                                            loff_t pos, size_t count,
@@ -2216,6 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int full_coherency = !(osb->s_mount_opt &
                               OCFS2_MOUNT_COHERENCY_BUFFERED);
+        int unaligned_dio = 0;
        trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
                (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2284,6 +2302,10 @@ relock:
                goto out;
        }
+        if (direct_io && !is_sync_kiocb(iocb))
+                unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+                                                      *ppos);
        /*
         * We can't complete the direct I/O as requested, fall back to
         * buffered I/O.
@@ -2299,6 +2321,18 @@ relock:
                goto relock;
        }
+        if (unaligned_dio) {
+                /*
+                 * Wait on previous unaligned aio to complete before
+                 * proceeding.
+                 */
+                ocfs2_aiodio_wait(inode);
+                /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+                atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+                ocfs2_iocb_set_unaligned_aio(iocb);
+        }
        /*
         * To later detect whether a journal commit for sync writes is
         * necessary, we sample i_size, and cluster count here.
@@ -2371,8 +2405,12 @@ out_dio:
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
                have_alloc_sem = 0;
+                unaligned_dio = 0;
        }
+        if (unaligned_dio)
+                atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
 out:
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
author	Mark Fasheh <mfasheh@suse.com>	2011-06-22 17:23:38 -0400
committer	Joel Becker <jlbec@evilplan.org>	2011-07-28 05:07:16 -0400
commit	a11f7e63c59810a81494d4c4b028af707d4c7ca4 (patch)
tree	6d28cfc9519f96db5c20780bf765de9e0fc03bef /fs/ocfs2/file.c
parent	730e663bd82c1a10a85ff00728d34152a5a67ec8 (diff)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b1e35a392ca5..145f4533a936 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c
@@ -2038,6 +2038,23 @@ out:
2038	return ret;	2038	return ret;
2039	}	2039	}
2040		2040
		2041	static void ocfs2_aiodio_wait(struct inode *inode)
		2042	{
		2043	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
		2044
		2045	wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
		2046	}
		2047
		2048	static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
		2049	{
		2050	int blockmask = inode->i_sb->s_blocksize - 1;
		2051	loff_t final_size = pos + count;
		2052
		2053	if ((pos & blockmask) \|\| (final_size & blockmask))
		2054	return 1;
		2055	return 0;
		2056	}
		2057
2041	static int ocfs2_prepare_inode_for_refcount(struct inode *inode,	2058	static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2042	struct file *file,	2059	struct file *file,
2043	loff_t pos, size_t count,	2060	loff_t pos, size_t count,
@@ -2216,6 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2216	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);	2233	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2217	int full_coherency = !(osb->s_mount_opt &	2234	int full_coherency = !(osb->s_mount_opt &
2218	OCFS2_MOUNT_COHERENCY_BUFFERED);	2235	OCFS2_MOUNT_COHERENCY_BUFFERED);
		2236	int unaligned_dio = 0;
2219		2237
2220	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,	2238	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2221	(unsigned long long)OCFS2_I(inode)->ip_blkno,	2239	(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2284,6 +2302,10 @@ relock:
2284	goto out;	2302	goto out;
2285	}	2303	}
2286		2304
		2305	if (direct_io && !is_sync_kiocb(iocb))
		2306	unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
		2307	*ppos);
		2308
2287	/*	2309	/*
2288	* We can't complete the direct I/O as requested, fall back to	2310	* We can't complete the direct I/O as requested, fall back to
2289	* buffered I/O.	2311	* buffered I/O.
@@ -2299,6 +2321,18 @@ relock:
2299	goto relock;	2321	goto relock;
2300	}	2322	}
2301		2323
		2324	if (unaligned_dio) {
		2325	/*
		2326	* Wait on previous unaligned aio to complete before
		2327	* proceeding.
		2328	*/
		2329	ocfs2_aiodio_wait(inode);
		2330
		2331	/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
		2332	atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
		2333	ocfs2_iocb_set_unaligned_aio(iocb);
		2334	}
		2335
2302	/*	2336	/*
2303	* To later detect whether a journal commit for sync writes is	2337	* To later detect whether a journal commit for sync writes is
2304	* necessary, we sample i_size, and cluster count here.	2338	* necessary, we sample i_size, and cluster count here.
@@ -2371,8 +2405,12 @@ out_dio:
2371	if ((ret == -EIOCBQUEUED) \|\| (!ocfs2_iocb_is_rw_locked(iocb))) {	2405	if ((ret == -EIOCBQUEUED) \|\| (!ocfs2_iocb_is_rw_locked(iocb))) {
2372	rw_level = -1;	2406	rw_level = -1;
2373	have_alloc_sem = 0;	2407	have_alloc_sem = 0;
		2408	unaligned_dio = 0;
2374	}	2409	}
2375		2410
		2411	if (unaligned_dio)
		2412	atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
		2413
2376	out:	2414	out:
2377	if (rw_level != -1)	2415	if (rw_level != -1)
2378	ocfs2_rw_unlock(inode, rw_level);	2416	ocfs2_rw_unlock(inode, rw_level);