author		Dmitry Monakhov <dmonakhov@openvz.org>	2012-09-29 00:14:55 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2012-09-29 00:14:55 -0400
commit		28a535f9a0df060569dcc786e5bc2e1de43d7dc7 (patch)
tree		07db5cecf251794492ae92c0714d400423feb5cc /fs/ext4/page-io.c
parent		82e54229118785badffb4ef5ba4803df25fe007f (diff)
ext4: completed_io locking cleanup
The current unwritten extent conversion state machine is very fuzzy:
- For unknown reasons it performs the conversion under i_mutex. What for?
  My diagnosis: we already protect the extent tree with i_data_sem, and
  truncate and punch_hole should wait for DIO, so the only data we have to
  protect is the modification of end_io->flags. But only flush_completed_IO
  and end_io_work modify these flags, so we can serialize them via
  i_completed_io_lock.
All these games with mutex_trylock currently result in the following deadlock:

 truncate:                      kworker:
  ext4_setattr                   ext4_end_io_work
   mutex_lock(i_mutex)
   inode_dio_wait(inode)  ->BLOCK
                DEADLOCK<-       mutex_trylock()
                                 inode_dio_done()
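For illustration, here is a minimal userspace model of the same inversion
(hypothetical names invented for this note; pthreads stand in for i_mutex and
the inode's DIO counter, and the program hangs by design, just like the
kernel case):

/* deadlock_model.c - toy model of the truncate vs. kworker deadlock.
 * Not ext4 code: names are made up, pthreads replace kernel primitives.
 * Build with "cc -pthread deadlock_model.c"; it hangs intentionally. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static volatile int pending_dio = 1;	/* one in-flight DIO */

static void *truncate_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&i_mutex);	/* ext4_setattr: mutex_lock(i_mutex) */
	puts("truncate: holding i_mutex, waiting for DIO");
	while (pending_dio)		/* inode_dio_wait(inode) -> BLOCK */
		usleep(1000);
	pthread_mutex_unlock(&i_mutex);
	return NULL;
}

static void *worker_path(void *arg)
{
	(void)arg;
	usleep(100000);			/* let truncate take i_mutex first */
	/* The old worker retried mutex_trylock() forever; a plain lock
	 * makes the dependency cycle explicit. */
	pthread_mutex_lock(&i_mutex);	/* never succeeds */
	pending_dio = 0;		/* inode_dio_done() - unreachable */
	pthread_mutex_unlock(&i_mutex);
	return NULL;
}

int main(void)
{
	pthread_t t, w;
	pthread_create(&t, NULL, truncate_path, NULL);
	pthread_create(&w, NULL, worker_path, NULL);
	pthread_join(t, NULL);		/* deadlock: never returns */
	pthread_join(w, NULL);
	return 0;
}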
#TEST_CASE1_BEGIN
MNT=/mnt_scratch
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
sleep 2
truncate -s 0 $MNT/file
#TEST_CASE1_END
Or use xfstest 286: https://github.com/dmonakhov/xfstests/blob/devel/286
This patch makes the state machine simple and clean (a toy model of the new
flow is sketched after this list):

(1) xxx_end_io schedules the final extent conversion simply by calling
    ext4_add_complete_io(), which appends it to ei->i_completed_io_list.
    NOTE1: because of (2A), the work should be queued only if
    ->i_completed_io_list was empty; otherwise the work is already scheduled.

(2) ext4_flush_completed_IO is responsible for handling all pending
    end_io entries from ei->i_completed_io_list. The flushing sequence
    consists of the following stages:
    A) LOCKED: atomically drain completed_io_list to a local list
    B) perform extent conversion
    C) LOCKED: move converted io's to the to_free list for final deletion
       (this logic depends on the context we were called from)
    D) final end_io context destruction
    NOTE2: i_mutex is no longer required because end_io->flags modification
    is protected by ei->i_completed_io_lock.
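As a rough illustration of the sequence above, here is a self-contained
userspace sketch (toy types and names, invented for this note; a pthread
mutex stands in for ei->i_completed_io_lock, a singly-linked list for the
kernel list, and the QUEUED-flag bookkeeping is omitted):

/* flush_model.c - toy model of (1) and stages (2A)-(2D); not ext4 code. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct io_end {
	struct io_end *next;
	int unwritten;			/* models EXT4_IO_END_UNWRITTEN */
};

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static struct io_end *completed_list;	/* models ei->i_completed_io_list */

/* (1): add a finished io_end; kick the worker only on the first
 * insertion, since a pending flush will drain everything queued so far. */
static void add_complete_io(struct io_end *io)
{
	pthread_mutex_lock(&io_lock);
	if (completed_list == NULL)
		puts("queue_work()");	/* stand-in for the real workqueue */
	io->next = completed_list;	/* toy LIFO; the real code appends */
	completed_list = io;
	pthread_mutex_unlock(&io_lock);
}

static int flush_completed_io(void)
{
	struct io_end *local, *io;

	/* (2A) LOCKED: atomically drain the shared list to a local list */
	pthread_mutex_lock(&io_lock);
	local = completed_list;
	completed_list = NULL;
	pthread_mutex_unlock(&io_lock);

	/* (2B) perform the conversion outside the lock */
	for (io = local; io; io = io->next)
		io->unwritten = 0;	/* models ext4_end_io(io) */

	/* (2C/D) free the converted contexts */
	while (local) {
		io = local;
		local = local->next;
		free(io);
	}
	return 0;
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct io_end *io = calloc(1, sizeof(*io));
		io->unwritten = 1;
		add_complete_io(io);
	}
	return flush_completed_io();
}

The point of the drain in (2A) is that extent conversion can then run
without holding any lock, while the flag updates of (2C) happen in a single
critical section.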
Full list of changes:
- Move all completion end_io related routines to page-io.c in order to
  improve logic locality.
- Move open-coded logic from various xx_end_xx routines to
  ext4_add_complete_io().
- Remove EXT4_IO_END_FSYNC.
- Improve SMP scalability by removing the useless i_mutex, which no longer
  protects io->flags anyway.
- Reduce lock contention on i_completed_io_lock by optimizing the list walk.
- Rename ext4_end_io_nolock to ext4_end_io and make it static.
- Check flush completion status in ext4_ext_punch_hole(), because it is
  not a good idea to punch blocks from a corrupted inode.
Changes since V3 (in response to Jan's comments):
- Fall back to the active flush_completed_IO() approach in order to prevent
  performance issues with nolock DIO reads.

Changes since V2:
- Fix a use-after-free caused by a race between truncate and end_io_work.
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/page-io.c')
-rw-r--r--	fs/ext4/page-io.c	171
1 file changed, 113 insertions(+), 58 deletions(-)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 997002218228..5b24c407701b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
 	int i;
 
 	BUG_ON(!io);
+	BUG_ON(!list_empty(&io->list));
 	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
 
 	if (io->page)
@@ -83,21 +84,14 @@ void ext4_free_io_end(ext4_io_end_t *io)
 	kmem_cache_free(io_end_cachep, io);
 }
 
-/*
- * check a range of space and convert unwritten extents to written.
- *
- * Called with inode->i_mutex; we depend on this when we manipulate
- * io->flag, since we could otherwise race with ext4_flush_completed_IO()
- */
-int ext4_end_io_nolock(ext4_io_end_t *io)
+/* check a range of space and convert unwritten extents to written. */
+static int ext4_end_io(ext4_io_end_t *io)
 {
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
 	ssize_t size = io->size;
 	int ret = 0;
 
-	BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
-
 	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
 		   "list->prev 0x%p\n",
 		   io, inode->i_ino, io->list.next, io->list.prev);
@@ -110,7 +104,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 			 "(inode %lu, offset %llu, size %zd, error %d)",
 			 inode->i_ino, offset, size, ret);
 	}
-	io->flag &= ~EXT4_IO_END_UNWRITTEN;
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 
@@ -122,51 +115,122 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 	return ret;
 }
 
-/*
- * work on completed aio dio IO, to convert unwritten extents to extents
- */
-static void ext4_end_io_work(struct work_struct *work)
+static void dump_completed_IO(struct inode *inode)
+{
+#ifdef	EXT4FS_DEBUG
+	struct list_head *cur, *before, *after;
+	ext4_io_end_t *io, *io0, *io1;
+	unsigned long flags;
+
+	if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
+		ext4_debug("inode %lu completed_io list is empty\n",
+			   inode->i_ino);
+		return;
+	}
+
+	ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
+	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+		cur = &io->list;
+		before = cur->prev;
+		io0 = container_of(before, ext4_io_end_t, list);
+		after = cur->next;
+		io1 = container_of(after, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			    io, inode->i_ino, io0, io1);
+	}
+#endif
+}
+
+/* Add the io_end to per-inode completed end_io list. */
+void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
-	ext4_io_end_t	*io = container_of(work, ext4_io_end_t, work);
-	struct inode	*inode = io->inode;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long	flags;
+	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+	struct workqueue_struct *wq;
+	unsigned long flags;
+
+	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	if (io->flag & EXT4_IO_END_IN_FSYNC)
-		goto requeue;
-	if (list_empty(&io->list)) {
-		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-		goto free;
+	if (list_empty(&ei->i_completed_io_list)) {
+		io_end->flag |= EXT4_IO_END_QUEUED;
+		queue_work(wq, &io_end->work);
 	}
+	list_add_tail(&io_end->list, &ei->i_completed_io_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
 
-	if (!mutex_trylock(&inode->i_mutex)) {
-		bool was_queued;
-requeue:
-		was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
-		io->flag |= EXT4_IO_END_QUEUED;
-		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-		/*
-		 * Requeue the work instead of waiting so that the work
-		 * items queued after this can be processed.
-		 */
-		queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
-		/*
-		 * To prevent the ext4-dio-unwritten thread from keeping
-		 * requeueing end_io requests and occupying cpu for too long,
-		 * yield the cpu if it sees an end_io request that has already
-		 * been requeued.
-		 */
-		if (was_queued)
-			yield();
-		return;
+static int ext4_do_flush_completed_IO(struct inode *inode,
+				      ext4_io_end_t *work_io)
+{
+	ext4_io_end_t *io;
+	struct list_head unwritten, complete, to_free;
+	unsigned long flags;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int err, ret = 0;
+
+	INIT_LIST_HEAD(&complete);
+	INIT_LIST_HEAD(&to_free);
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	dump_completed_IO(inode);
+	list_replace_init(&ei->i_completed_io_list, &unwritten);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	while (!list_empty(&unwritten)) {
+		io = list_entry(unwritten.next, ext4_io_end_t, list);
+		BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
+		list_del_init(&io->list);
+
+		err = ext4_end_io(io);
+		if (unlikely(!ret && err))
+			ret = err;
+
+		list_add_tail(&io->list, &complete);
+	}
+	/* It is important to update all flags for all end_io in one shot w/o
+	 * dropping the lock.*/
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	while (!list_empty(&complete)) {
+		io = list_entry(complete.next, ext4_io_end_t, list);
+		io->flag &= ~EXT4_IO_END_UNWRITTEN;
+		/* end_io context can not be destroyed now because it still
+		 * used by queued worker. Worker thread will destroy it later */
+		if (io->flag & EXT4_IO_END_QUEUED)
+			list_del_init(&io->list);
+		else
+			list_move(&io->list, &to_free);
+	}
+	/* If we are called from worker context, it is time to clear queued
+	 * flag, and destroy it's end_io if it was converted already */
+	if (work_io) {
+		work_io->flag &= ~EXT4_IO_END_QUEUED;
+		if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
+			list_add_tail(&work_io->list, &to_free);
 	}
-	list_del_init(&io->list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-	(void) ext4_end_io_nolock(io);
-	mutex_unlock(&inode->i_mutex);
-free:
-	ext4_free_io_end(io);
+
+	while (!list_empty(&to_free)) {
+		io = list_entry(to_free.next, ext4_io_end_t, list);
+		list_del_init(&io->list);
+		ext4_free_io_end(io);
+	}
+	return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+	ext4_do_flush_completed_IO(io->inode, io);
+}
+
+int ext4_flush_completed_IO(struct inode *inode)
+{
+	return ext4_do_flush_completed_IO(inode, NULL);
 }
 
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -199,9 +263,7 @@ static void buffer_io_error(struct buffer_head *bh)
 static void ext4_end_bio(struct bio *bio, int error)
 {
 	ext4_io_end_t *io_end = bio->bi_private;
-	struct workqueue_struct *wq;
 	struct inode *inode;
-	unsigned long flags;
 	int i;
 	sector_t bi_sector = bio->bi_sector;
 
@@ -259,14 +321,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 		return;
 	}
 
-	/* Add the io_end to per-inode completed io list*/
-	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
-	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
-	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
-	/* queue the work to convert unwritten extents to written */
-	queue_work(wq, &io_end->work);
+	ext4_add_complete_io(io_end);
 }
 
 void ext4_io_submit(struct ext4_io_submit *io)