aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mark Fasheh <mfasheh@suse.com> 2011-06-22 17:23:38 -0400
committer Joel Becker <jlbec@evilplan.org> 2011-07-28 05:07:16 -0400
commit a11f7e63c59810a81494d4c4b028af707d4c7ca4 (patch)
tree 6d28cfc9519f96db5c20780bf765de9e0fc03bef
parent 730e663bd82c1a10a85ff00728d34152a5a67ec8 (diff)
ocfs2: serialize unaligned aio
Fix a corruption that can happen when we have two or more outstanding aios to an overlapping unaligned region. Ext4 (e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix similar issues.

In our case, what happens is that we can have an outstanding aio on a region, and if a write comes in with some bytes overlapping the original aio, we may decide to read that region into a page before continuing (typically because of buffered-io fallback). Since we have no ordering guarantees with the aio, we can read stale or bad data into the page and then write it back out.

If the i/o is page and block aligned, then we avoid this issue, as there won't be any need to read data from disk.

I took the same approach as Eric in the ext4 patch and introduced some serialization of unaligned async direct i/o. I don't expect this to have an effect on the most common cases of AIO. Unaligned aio will be slower, but that is far more acceptable than data corruption.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
-rw-r--r-- fs/ocfs2/aops.c 10
-rw-r--r-- fs/ocfs2/aops.h 14
-rw-r--r-- fs/ocfs2/file.c 38
-rw-r--r-- fs/ocfs2/inode.h 3
-rw-r--r-- fs/ocfs2/super.c 10
5 files changed, 73 insertions, 2 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ac97bca282d2..4c1ec8f2d8c1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -564,6 +564,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
564{ 564{
565 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 565 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
566 int level; 566 int level;
567 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
567 568
568 /* this io's submitter should not have unlocked this before we could */ 569 /* this io's submitter should not have unlocked this before we could */
569 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 570 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -573,6 +574,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
573 ocfs2_iocb_clear_sem_locked(iocb); 574 ocfs2_iocb_clear_sem_locked(iocb);
574 } 575 }
575 576
577 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
578 ocfs2_iocb_clear_unaligned_aio(iocb);
579
580 if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
581 waitqueue_active(wq)) {
582 wake_up_all(wq);
583 }
584 }
585
576 ocfs2_iocb_clear_rw_locked(iocb); 586 ocfs2_iocb_clear_rw_locked(iocb);
577 587
578 level = ocfs2_iocb_rw_locked_level(iocb); 588 level = ocfs2_iocb_rw_locked_level(iocb);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad987a6..ffb2da370a99 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0, 78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL, 79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM, 80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_UNALIGNED_IO,
81 OCFS2_IOCB_NUM_LOCKS 82 OCFS2_IOCB_NUM_LOCKS
82}; 83};
83 84
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 92 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \ 93#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 94 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
95
/*
 * Per-iocb "unaligned aio" flag helpers.
 *
 * Each macro treats iocb->private as an unsigned long bitmap and sets,
 * clears or tests bit OCFS2_IOCB_UNALIGNED_IO in it via set_bit(),
 * clear_bit() and test_bit() respectively.
 *
 * NOTE(review): the macro argument is expanded as `&iocb->private` without
 * parentheses, so callers should pass a plain pointer expression.
 */
96#define ocfs2_iocb_set_unaligned_aio(iocb) \
97 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
98#define ocfs2_iocb_clear_unaligned_aio(iocb) \
99 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
100#define ocfs2_iocb_is_unaligned_aio(iocb) \
101 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
102
/*
 * Hashed table of wait queues used to wait for unaligned aio completion.
 *
 * ocfs2_ioend_wq(v) casts v to unsigned long, reduces it modulo
 * OCFS2_IOEND_WQ_HASH_SZ (37) and returns the address of that slot in the
 * global ocfs2__ioend_wq array. Because the table has a fixed number of
 * slots, distinct values of v may map to the same wait queue head.
 *
 * The array is only declared here; its definition lives elsewhere.
 */
103#define OCFS2_IOEND_WQ_HASH_SZ 37
104#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
105 OCFS2_IOEND_WQ_HASH_SZ])
106extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
107
94#endif /* OCFS2_FILE_H */ 108#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b1e35a392ca5..145f4533a936 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2038,6 +2038,23 @@ out:
2038 return ret; 2038 return ret;
2039} 2039}
2040 2040
/*
 * Sleep until the inode's ip_unaligned_aio counter reads zero.
 *
 * The wait queue is looked up with ocfs2_ioend_wq(inode), and wait_event()
 * re-evaluates the atomic_read() condition each time the queue is woken.
 *
 * NOTE(review): this function only waits; it does not itself claim the
 * counter. Whether two callers can both observe zero before either
 * increments depends on locking held by the caller -- confirm.
 * NOTE(review): any path that decrements ip_unaligned_aio without waking
 * this queue could leave a sleeper here until the next wake-up -- verify
 * every decrement site.
 */
2041static void ocfs2_aiodio_wait(struct inode *inode)
2042{
2043 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2044
2045 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2046}
2047
/*
 * Report whether an I/O of @count bytes starting at @pos begins or ends
 * off a filesystem block boundary.
 *
 * @inode: inode whose superblock supplies the block size (i_sb->s_blocksize)
 * @count: length of the I/O in bytes
 * @pos:   starting file offset of the I/O
 *
 * Returns 1 if either @pos or @pos + @count has any bits set under the
 * block mask, 0 if both are block aligned. Alignment is tested against
 * the block size only; the page size is not consulted here.
 *
 * NOTE(review): the mask test assumes s_blocksize is a power of two --
 * TODO confirm.
 */
2048static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2049{
2050 int blockmask = inode->i_sb->s_blocksize - 1;
2051 loff_t final_size = pos + count;
2052
2053 if ((pos & blockmask) || (final_size & blockmask))
2054 return 1;
2055 return 0;
2056}
2057
2041static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2058static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2042 struct file *file, 2059 struct file *file,
2043 loff_t pos, size_t count, 2060 loff_t pos, size_t count,
@@ -2216,6 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2217 int full_coherency = !(osb->s_mount_opt & 2234 int full_coherency = !(osb->s_mount_opt &
2218 OCFS2_MOUNT_COHERENCY_BUFFERED); 2235 OCFS2_MOUNT_COHERENCY_BUFFERED);
2236 int unaligned_dio = 0;
2219 2237
2220 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2238 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2221 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2239 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2284,6 +2302,10 @@ relock:
2284 goto out; 2302 goto out;
2285 } 2303 }
2286 2304
2305 if (direct_io && !is_sync_kiocb(iocb))
2306 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2307 *ppos);
2308
2287 /* 2309 /*
2288 * We can't complete the direct I/O as requested, fall back to 2310 * We can't complete the direct I/O as requested, fall back to
2289 * buffered I/O. 2311 * buffered I/O.
@@ -2299,6 +2321,18 @@ relock:
2299 goto relock; 2321 goto relock;
2300 } 2322 }
2301 2323
2324 if (unaligned_dio) {
2325 /*
2326 * Wait on previous unaligned aio to complete before
2327 * proceeding.
2328 */
2329 ocfs2_aiodio_wait(inode);
2330
2331 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2332 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2333 ocfs2_iocb_set_unaligned_aio(iocb);
2334 }
2335
2302 /* 2336 /*
2303 * To later detect whether a journal commit for sync writes is 2337 * To later detect whether a journal commit for sync writes is
2304 * necessary, we sample i_size, and cluster count here. 2338 * necessary, we sample i_size, and cluster count here.
@@ -2371,8 +2405,12 @@ out_dio:
2371 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2405 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2372 rw_level = -1; 2406 rw_level = -1;
2373 have_alloc_sem = 0; 2407 have_alloc_sem = 0;
2408 unaligned_dio = 0;
2374 } 2409 }
2375 2410
2411 if (unaligned_dio)
2412 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2413
2376out: 2414out:
2377 if (rw_level != -1) 2415 if (rw_level != -1)
2378 ocfs2_rw_unlock(inode, rw_level); 2416 ocfs2_rw_unlock(inode, rw_level);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b149b3a..88924a3133fa 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 atomic_t ip_unaligned_aio;
48
46 /* These fields are protected by ip_lock */ 49 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 50 spinlock_t ip_lock;
48 u32 ip_open_count; 51 u32 ip_open_count;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 029c4cd8a691..603f5fe9f816 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
54#include "ocfs1_fs_compat.h" 54#include "ocfs1_fs_compat.h"
55 55
56#include "alloc.h" 56#include "alloc.h"
57#include "aops.h"
57#include "blockcheck.h" 58#include "blockcheck.h"
58#include "dlmglue.h" 59#include "dlmglue.h"
59#include "export.h" 60#include "export.h"
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1616 return 0; 1617 return 0;
1617} 1618}
1618 1619
1620wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1621
1619static int __init ocfs2_init(void) 1622static int __init ocfs2_init(void)
1620{ 1623{
1621 int status; 1624 int status, i;
1622 1625
1623 ocfs2_print_version(); 1626 ocfs2_print_version();
1624 1627
1628 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1629 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1630
1625 status = init_ocfs2_uptodate_cache(); 1631 status = init_ocfs2_uptodate_cache();
1626 if (status < 0) { 1632 if (status < 0) {
1627 mlog_errno(status); 1633 mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
1760 ocfs2_extent_map_init(&oi->vfs_inode); 1766 ocfs2_extent_map_init(&oi->vfs_inode);
1761 INIT_LIST_HEAD(&oi->ip_io_markers); 1767 INIT_LIST_HEAD(&oi->ip_io_markers);
1762 oi->ip_dir_start_lookup = 0; 1768 oi->ip_dir_start_lookup = 0;
1763 1769 atomic_set(&oi->ip_unaligned_aio, 0);
1764 init_rwsem(&oi->ip_alloc_sem); 1770 init_rwsem(&oi->ip_alloc_sem);
1765 init_rwsem(&oi->ip_xattr_sem); 1771 init_rwsem(&oi->ip_xattr_sem);
1766 mutex_init(&oi->ip_io_mutex); 1772 mutex_init(&oi->ip_io_mutex);