aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Sandeen <sandeen@redhat.com>2011-02-12 08:17:34 -0500
committerTheodore Ts'o <tytso@mit.edu>2011-02-12 08:17:34 -0500
commite9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d (patch)
tree9f347a48889a00071dbe1f12be4c50ec7a45542b
parent2892c15ddda6a76dc10b7499e56c0f3b892e5a69 (diff)
ext4: serialize unaligned asynchronous DIO
ext4 has a data corruption case when doing non-block-aligned asynchronous direct IO into a sparse file, as demonstrated by xfstest 240. The root cause is that while ext4 preallocates space in the hole, mappings of that space still look "new" and dio_zero_block() will zero out the unwritten portions. When more than one AIO thread is going, they both find this "new" block and race to zero out their portion; this is uncoordinated and causes data corruption. Dave Chinner fixed this for xfs by simply serializing all unaligned asynchronous direct IO. I've done the same here. The difference is that we only wait on conversions, not all IO. This is a very big hammer, and I'm not very pleased with stuffing this into ext4_file_write(). But since ext4 is DIO_LOCKING, we need to serialize it at this high level. I tried to move this into ext4_ext_direct_IO, but by then we have the i_mutex already, and we will wait on the work queue to do conversions - which must also take the i_mutex. So that won't work. This was originally exposed by qemu-kvm installing to a raw disk image with a normal sector-63 alignment. I've tested a backport of this patch with qemu, and it does avoid the corruption. It is also quite a lot slower (14 min for package installs, vs. 8 min for well-aligned) but I'll take slow correctness over fast corruption any day. Mingming suggested that we can track outstanding conversions, and wait on those so that non-sparse files won't be affected, and I've implemented that here; unaligned AIO to nonsparse files won't take a perf hit. [tytso@mit.edu: Keep the mutex as a hashed array instead of bloating the ext4 inode] [tytso@mit.edu: Fix up namespace issues so that global variables are protected with an "ext4_" prefix.] Signed-off-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/ext4/ext4.h10
-rw-r--r--fs/ext4/extents.c10
-rw-r--r--fs/ext4/file.c60
-rw-r--r--fs/ext4/page-io.c25
-rw-r--r--fs/ext4/super.c13
5 files changed, 100 insertions, 18 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..3aa0b72b3b94 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
849 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
850 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
851 atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
851 852
852 spinlock_t i_block_reservation_lock; 853 spinlock_t i_block_reservation_lock;
853 854
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2119 2120
2120#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2121#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2121 2122
2123/* For ioend & aio unwritten conversion wait queues */
2124#define EXT4_WQ_HASH_SZ 37
2125#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2126 EXT4_WQ_HASH_SZ])
2127#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2128 EXT4_WQ_HASH_SZ])
2129extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2130extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2131
2122#endif /* __KERNEL__ */ 2132#endif /* __KERNEL__ */
2123 2133
2124#endif /* _EXT4_H */ 2134#endif /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..ccce8a7e94ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3174 * that this IO needs to convertion to written when IO is 3174 * that this IO needs to convertion to written when IO is
3175 * completed 3175 * completed
3176 */ 3176 */
3177 if (io) 3177 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3178 io->flag = EXT4_IO_END_UNWRITTEN; 3178 io->flag = EXT4_IO_END_UNWRITTEN;
3179 else 3179 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3180 } else
3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3181 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3181 if (ext4_should_dioread_nolock(inode)) 3182 if (ext4_should_dioread_nolock(inode))
3182 map->m_flags |= EXT4_MAP_UNINIT; 3183 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3463 * that we need to perform convertion when IO is done. 3464 * that we need to perform convertion when IO is done.
3464 */ 3465 */
3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3466 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3466 if (io) 3467 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3467 io->flag = EXT4_IO_END_UNWRITTEN; 3468 io->flag = EXT4_IO_END_UNWRITTEN;
3468 else 3469 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3470 } else
3469 ext4_set_inode_state(inode, 3471 ext4_set_inode_state(inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3472 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3473 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
81 } 120 }
82 121
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
134 }
135
136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4e9b0a242f4c..955cc309142f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void) 35int __init ext4_init_pageio(void)
40{ 36{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL) 38 if (io_page_cachep == NULL)
45 return -ENOMEM; 39 return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
48 kmem_cache_destroy(io_page_cachep); 42 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 43 return -ENOMEM;
50 } 44 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0; 45 return 0;
55} 46}
56 47
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
62 53
63void ext4_ioend_wait(struct inode *inode) 54void ext4_ioend_wait(struct inode *inode)
64{ 55{
65 wait_queue_head_t *wq = to_ioend_wq(inode); 56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
66 57
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68} 59}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
87 for (i = 0; i < io->num_io_pages; i++) 78 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]); 79 put_io_page(io->pages[i]);
89 io->num_io_pages = 0; 80 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode); 81 wq = ext4_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq)) 83 waitqueue_active(wq))
93 wake_up_all(wq); 84 wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
102 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
103 loff_t offset = io->offset; 94 loff_t offset = io->offset;
104 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
105 int ret = 0; 97 int ret = 0;
106 98
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
126 if (io->iocb) 118 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0); 119 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */ 120 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN; 121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
130 return ret; 131 return ret;
131} 132}
132 133
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 86b05486dc63..f6a318f836b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -833,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
833 ei->i_sync_tid = 0; 833 ei->i_sync_tid = 0;
834 ei->i_datasync_tid = 0; 834 ei->i_datasync_tid = 0;
835 atomic_set(&ei->i_ioend_count, 0); 835 atomic_set(&ei->i_ioend_count, 0);
836 atomic_set(&ei->i_aiodio_unwritten, 0);
836 837
837 return &ei->vfs_inode; 838 return &ei->vfs_inode;
838} 839}
@@ -4800,11 +4801,21 @@ static void ext4_exit_feat_adverts(void)
4800 kfree(ext4_feat); 4801 kfree(ext4_feat);
4801} 4802}
4802 4803
4804/* Shared across all ext4 file systems */
4805wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4806struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4807
4803static int __init ext4_init_fs(void) 4808static int __init ext4_init_fs(void)
4804{ 4809{
4805 int err; 4810 int i, err;
4806 4811
4807 ext4_check_flag_values(); 4812 ext4_check_flag_values();
4813
4814 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4815 mutex_init(&ext4__aio_mutex[i]);
4816 init_waitqueue_head(&ext4__ioend_wq[i]);
4817 }
4818
4808 err = ext4_init_pageio(); 4819 err = ext4_init_pageio();
4809 if (err) 4820 if (err)
4810 return err; 4821 return err;