diff options
author | Dave Chinner <david@fromorbit.com> | 2010-02-17 00:36:29 -0500 |
---|---|---|
committer | Alex Elder <aelder@sgi.com> | 2010-03-01 17:34:52 -0500 |
commit | 77d7a0c2eeb285c9069e15396703d0cb9690ac50 (patch) | |
tree | 22de501446dd5ba08581b04616408f90449f7211 /fs | |
parent | 66d834ea603d61bd90fedad90300ca91c5bba0a3 (diff) |
xfs: Non-blocking inode locking in IO completion
The introduction of barriers to loop devices has created a new IO
order completion dependency that XFS does not handle. The loop
device implements barriers using fsync and so turns a log IO in the
XFS filesystem on the loop device into a data IO in the backing
filesystem. That is, the completion of log IOs in the loop
filesystem is now dependent on completion of data IO in the backing
filesystem.
This can cause deadlocks when a flush daemon issues a log force with
an inode locked because the IO completion of IO on the inode is
blocked by the inode lock. This in turn prevents further data IO
completion from occurring on all XFS filesystems on that CPU (due to
the shared nature of the completion queues). This then prevents the
log IO from completing because the log is waiting for data IO
completion as well.
The fix for this new completion order dependency issue is to make
the IO completion inode locking non-blocking. If the inode lock
can't be grabbed, simply requeue the IO completion back to the work
queue so that it can be processed later. This prevents the
completion queue from being blocked and allows data IO completion on
other inodes to proceed, hence avoiding completion order dependent
deadlocks.
Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 93 |
1 files changed, 56 insertions, 37 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index ce369a816ce3..b493c63976cd 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
@@ -163,14 +163,17 @@ xfs_ioend_new_eof( | |||
163 | } | 163 | } |
164 | 164 | ||
165 | /* | 165 | /* |
166 | * Update on-disk file size now that data has been written to disk. | 166 | * Update on-disk file size now that data has been written to disk. The |
167 | * The current in-memory file size is i_size. If a write is beyond | 167 | * current in-memory file size is i_size. If a write is beyond eof i_new_size |
168 | * eof i_new_size will be the intended file size until i_size is | 168 | * will be the intended file size until i_size is updated. If this write does |
169 | * updated. If this write does not extend all the way to the valid | 169 | * not extend all the way to the valid file size then restrict this update to |
170 | * file size then restrict this update to the end of the write. | 170 | * the end of the write. |
171 | * | ||
172 | * This function does not block as blocking on the inode lock in IO completion | ||
173 | * can lead to IO completion order dependency deadlocks.. If it can't get the | ||
174 | * inode ilock it will return EAGAIN. Callers must handle this. | ||
171 | */ | 175 | */ |
172 | 176 | STATIC int | |
173 | STATIC void | ||
174 | xfs_setfilesize( | 177 | xfs_setfilesize( |
175 | xfs_ioend_t *ioend) | 178 | xfs_ioend_t *ioend) |
176 | { | 179 | { |
@@ -181,9 +184,11 @@ xfs_setfilesize( | |||
181 | ASSERT(ioend->io_type != IOMAP_READ); | 184 | ASSERT(ioend->io_type != IOMAP_READ); |
182 | 185 | ||
183 | if (unlikely(ioend->io_error)) | 186 | if (unlikely(ioend->io_error)) |
184 | return; | 187 | return 0; |
188 | |||
189 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) | ||
190 | return EAGAIN; | ||
185 | 191 | ||
186 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
187 | isize = xfs_ioend_new_eof(ioend); | 192 | isize = xfs_ioend_new_eof(ioend); |
188 | if (isize) { | 193 | if (isize) { |
189 | ip->i_d.di_size = isize; | 194 | ip->i_d.di_size = isize; |
@@ -191,6 +196,28 @@ xfs_setfilesize( | |||
191 | } | 196 | } |
192 | 197 | ||
193 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 198 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
199 | return 0; | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * Schedule IO completion handling on a xfsdatad if this was | ||
204 | * the final hold on this ioend. If we are asked to wait, | ||
205 | * flush the workqueue. | ||
206 | */ | ||
207 | STATIC void | ||
208 | xfs_finish_ioend( | ||
209 | xfs_ioend_t *ioend, | ||
210 | int wait) | ||
211 | { | ||
212 | if (atomic_dec_and_test(&ioend->io_remaining)) { | ||
213 | struct workqueue_struct *wq; | ||
214 | |||
215 | wq = (ioend->io_type == IOMAP_UNWRITTEN) ? | ||
216 | xfsconvertd_workqueue : xfsdatad_workqueue; | ||
217 | queue_work(wq, &ioend->io_work); | ||
218 | if (wait) | ||
219 | flush_workqueue(wq); | ||
220 | } | ||
194 | } | 221 | } |
195 | 222 | ||
196 | /* | 223 | /* |
@@ -198,11 +225,11 @@ xfs_setfilesize( | |||
198 | */ | 225 | */ |
199 | STATIC void | 226 | STATIC void |
200 | xfs_end_io( | 227 | xfs_end_io( |
201 | struct work_struct *work) | 228 | struct work_struct *work) |
202 | { | 229 | { |
203 | xfs_ioend_t *ioend = | 230 | xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); |
204 | container_of(work, xfs_ioend_t, io_work); | 231 | struct xfs_inode *ip = XFS_I(ioend->io_inode); |
205 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | 232 | int error; |
206 | 233 | ||
207 | /* | 234 | /* |
208 | * For unwritten extents we need to issue transactions to convert a | 235 | * For unwritten extents we need to issue transactions to convert a |
@@ -210,7 +237,6 @@ xfs_end_io( | |||
210 | */ | 237 | */ |
211 | if (ioend->io_type == IOMAP_UNWRITTEN && | 238 | if (ioend->io_type == IOMAP_UNWRITTEN && |
212 | likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { | 239 | likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { |
213 | int error; | ||
214 | 240 | ||
215 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, | 241 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, |
216 | ioend->io_size); | 242 | ioend->io_size); |
@@ -222,30 +248,23 @@ xfs_end_io( | |||
222 | * We might have to update the on-disk file size after extending | 248 | * We might have to update the on-disk file size after extending |
223 | * writes. | 249 | * writes. |
224 | */ | 250 | */ |
225 | if (ioend->io_type != IOMAP_READ) | 251 | if (ioend->io_type != IOMAP_READ) { |
226 | xfs_setfilesize(ioend); | 252 | error = xfs_setfilesize(ioend); |
227 | xfs_destroy_ioend(ioend); | 253 | ASSERT(!error || error == EAGAIN); |
228 | } | ||
229 | |||
230 | /* | ||
231 | * Schedule IO completion handling on a xfsdatad if this was | ||
232 | * the final hold on this ioend. If we are asked to wait, | ||
233 | * flush the workqueue. | ||
234 | */ | ||
235 | STATIC void | ||
236 | xfs_finish_ioend( | ||
237 | xfs_ioend_t *ioend, | ||
238 | int wait) | ||
239 | { | ||
240 | if (atomic_dec_and_test(&ioend->io_remaining)) { | ||
241 | struct workqueue_struct *wq; | ||
242 | |||
243 | wq = (ioend->io_type == IOMAP_UNWRITTEN) ? | ||
244 | xfsconvertd_workqueue : xfsdatad_workqueue; | ||
245 | queue_work(wq, &ioend->io_work); | ||
246 | if (wait) | ||
247 | flush_workqueue(wq); | ||
248 | } | 254 | } |
255 | |||
256 | /* | ||
257 | * If we didn't complete processing of the ioend, requeue it to the | ||
258 | * tail of the workqueue for another attempt later. Otherwise destroy | ||
259 | * it. | ||
260 | */ | ||
261 | if (error == EAGAIN) { | ||
262 | atomic_inc(&ioend->io_remaining); | ||
263 | xfs_finish_ioend(ioend, 0); | ||
264 | /* ensure we don't spin on blocked ioends */ | ||
265 | delay(1); | ||
266 | } else | ||
267 | xfs_destroy_ioend(ioend); | ||
249 | } | 268 | } |
250 | 269 | ||
251 | /* | 270 | /* |