aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/file.c
diff options
context:
space:
mode:
author: Mark Fasheh <mfasheh@suse.com> 2011-06-22 17:23:38 -0400
committer: Joel Becker <jlbec@evilplan.org> 2011-07-28 05:07:16 -0400
commit: a11f7e63c59810a81494d4c4b028af707d4c7ca4 (patch)
tree: 6d28cfc9519f96db5c20780bf765de9e0fc03bef /fs/ocfs2/file.c
parent: 730e663bd82c1a10a85ff00728d34152a5a67ec8 (diff)
ocfs2: serialize unaligned aio
Fix a corruption that can happen when we have two or more outstanding aios to an overlapping unaligned region. Ext4 (e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix similar issues.

In our case, what happens is that we can have an outstanding aio on a region, and if a write comes in with some bytes overlapping the original aio, we may decide to read that region into a page before continuing (typically because of buffered-io fallback). Since we have no ordering guarantees with the aio, we can read stale or bad data into the page and then write it back out.

If the i/o is page and block aligned, then we avoid this issue, as there won't be any need to read data from disk.

I took the same approach as Eric in the ext4 patch and introduced some serialization of unaligned async direct i/o. I don't expect this to have an effect on the most common cases of AIO. Unaligned aio will be slower, though, but that's far more acceptable than data corruption.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r-- fs/ocfs2/file.c 38
1 files changed, 38 insertions, 0 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b1e35a392ca5..145f4533a936 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2038,6 +2038,23 @@ out:
2038 return ret; 2038 return ret;
2039} 2039}
2040 2040
2041static void ocfs2_aiodio_wait(struct inode *inode)
2042{
2043 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2044
2045 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2046}
2047
2048static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2049{
2050 int blockmask = inode->i_sb->s_blocksize - 1;
2051 loff_t final_size = pos + count;
2052
2053 if ((pos & blockmask) || (final_size & blockmask))
2054 return 1;
2055 return 0;
2056}
2057
2041static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2058static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2042 struct file *file, 2059 struct file *file,
2043 loff_t pos, size_t count, 2060 loff_t pos, size_t count,
@@ -2216,6 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2217 int full_coherency = !(osb->s_mount_opt & 2234 int full_coherency = !(osb->s_mount_opt &
2218 OCFS2_MOUNT_COHERENCY_BUFFERED); 2235 OCFS2_MOUNT_COHERENCY_BUFFERED);
2236 int unaligned_dio = 0;
2219 2237
2220 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2238 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2221 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2239 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2284,6 +2302,10 @@ relock:
2284 goto out; 2302 goto out;
2285 } 2303 }
2286 2304
2305 if (direct_io && !is_sync_kiocb(iocb))
2306 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2307 *ppos);
2308
2287 /* 2309 /*
2288 * We can't complete the direct I/O as requested, fall back to 2310 * We can't complete the direct I/O as requested, fall back to
2289 * buffered I/O. 2311 * buffered I/O.
@@ -2299,6 +2321,18 @@ relock:
2299 goto relock; 2321 goto relock;
2300 } 2322 }
2301 2323
2324 if (unaligned_dio) {
2325 /*
2326 * Wait on previous unaligned aio to complete before
2327 * proceeding.
2328 */
2329 ocfs2_aiodio_wait(inode);
2330
2331 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2332 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2333 ocfs2_iocb_set_unaligned_aio(iocb);
2334 }
2335
2302 /* 2336 /*
2303 * To later detect whether a journal commit for sync writes is 2337 * To later detect whether a journal commit for sync writes is
2304 * necessary, we sample i_size, and cluster count here. 2338 * necessary, we sample i_size, and cluster count here.
@@ -2371,8 +2405,12 @@ out_dio:
2371 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2405 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2372 rw_level = -1; 2406 rw_level = -1;
2373 have_alloc_sem = 0; 2407 have_alloc_sem = 0;
2408 unaligned_dio = 0;
2374 } 2409 }
2375 2410
2411 if (unaligned_dio)
2412 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2413
2376out: 2414out:
2377 if (rw_level != -1) 2415 if (rw_level != -1)
2378 ocfs2_rw_unlock(inode, rw_level); 2416 ocfs2_rw_unlock(inode, rw_level);