author	Dave Chinner <david@fromorbit.com>	2010-02-17 00:36:29 -0500
committer	Alex Elder <aelder@sgi.com>	2010-03-01 17:34:52 -0500
commit	77d7a0c2eeb285c9069e15396703d0cb9690ac50 (patch)
tree	22de501446dd5ba08581b04616408f90449f7211
parent	66d834ea603d61bd90fedad90300ca91c5bba0a3 (diff)
xfs: Non-blocking inode locking in IO completion
The introduction of barriers to loop devices has created a new IO completion order dependency that XFS does not handle. The loop device implements barriers using fsync and so turns a log IO in the XFS filesystem on the loop device into a data IO in the backing filesystem. That is, the completion of log IOs in the loop filesystem is now dependent on completion of data IO in the backing filesystem.

This can cause deadlocks when a flush daemon issues a log force with an inode locked, because IO completion on that inode is blocked by the inode lock. This in turn prevents further data IO completion from occurring on all XFS filesystems on that CPU (due to the shared nature of the completion queues). This then prevents the log IO from completing, because the log is waiting for data IO completion as well.

The fix for this new completion order dependency issue is to make the IO completion inode locking non-blocking. If the inode lock can't be grabbed, simply requeue the IO completion back to the work queue so that it can be processed later. This prevents the completion queue from being blocked and allows data IO completion on other inodes to proceed, hence avoiding completion order dependent deadlocks.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
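As an aside for readers skimming the patch below, the trylock-and-requeue idea it introduces can be modelled outside the kernel. The following sketch is not kernel code; every name in it (struct ioend, complete_ioend, the pthread mutex standing in for the inode ilock) is an illustrative assumption. It only shows the shape of a completion handler that returns EAGAIN instead of blocking, leaving the caller to retry later.

/* Hedged userspace sketch of the non-blocking completion pattern. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct ioend {			/* stand-in for xfs_ioend_t */
	pthread_mutex_t	*ilock;	/* stand-in for the inode ilock */
	int		done;
};

/* Try to finish one ioend; return EAGAIN rather than block on the lock. */
static int complete_ioend(struct ioend *io)
{
	if (pthread_mutex_trylock(io->ilock) != 0)
		return EAGAIN;		/* lock busy: caller must requeue */
	io->done = 1;			/* the on-disk size update would go here */
	pthread_mutex_unlock(io->ilock);
	return 0;
}

int main(void)
{
	pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;
	struct ioend io = { .ilock = &ilock, .done = 0 };
	int error;

	/* First attempt while the lock is held elsewhere: must not block. */
	pthread_mutex_lock(&ilock);
	error = complete_ioend(&io);
	printf("while locked: %s\n", error == EAGAIN ? "requeued (EAGAIN)" : "done");
	pthread_mutex_unlock(&ilock);

	/* Retry once the lock is free, as the requeued work item would. */
	error = complete_ioend(&io);
	printf("after unlock: %s\n", (error == 0 && io.done) ? "done" : "requeued");
	return 0;
}

The actual patch does the equivalent with xfs_ilock_nowait() and a requeue via xfs_finish_ioend(), plus a delay(1) so a blocked ioend does not spin on the completion queue.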
-rw-r--r--	fs/xfs/linux-2.6/xfs_aops.c	93
1 files changed, 56 insertions, 37 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ce369a816ce3..b493c63976cd 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -163,14 +163,17 @@ xfs_ioend_new_eof(
 }
 
 /*
- * Update on-disk file size now that data has been written to disk.
- * The current in-memory file size is i_size. If a write is beyond
- * eof i_new_size will be the intended file size until i_size is
- * updated. If this write does not extend all the way to the valid
- * file size then restrict this update to the end of the write.
+ * Update on-disk file size now that data has been written to disk. The
+ * current in-memory file size is i_size. If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated. If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
  */
-
-STATIC void
+STATIC int
 xfs_setfilesize(
 	xfs_ioend_t		*ioend)
 {
@@ -181,9 +184,11 @@ xfs_setfilesize(
 	ASSERT(ioend->io_type != IOMAP_READ);
 
 	if (unlikely(ioend->io_error))
-		return;
+		return 0;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+		return EAGAIN;
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	isize = xfs_ioend_new_eof(ioend);
 	if (isize) {
 		ip->i_d.di_size = isize;
@@ -191,6 +196,28 @@ xfs_setfilesize(
 	}
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+	xfs_ioend_t	*ioend,
+	int		wait)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct workqueue_struct *wq;
+
+		wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+			xfsconvertd_workqueue : xfsdatad_workqueue;
+		queue_work(wq, &ioend->io_work);
+		if (wait)
+			flush_workqueue(wq);
+	}
 }
 
 /*
@@ -198,11 +225,11 @@ xfs_setfilesize(
  */
 STATIC void
 xfs_end_io(
 	struct work_struct	*work)
 {
-	xfs_ioend_t		*ioend =
-		container_of(work, xfs_ioend_t, io_work);
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	int		error = 0;
 
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
@@ -210,7 +237,6 @@ xfs_end_io(
 	 */
 	if (ioend->io_type == IOMAP_UNWRITTEN &&
 	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
-		int		error;
 
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						 ioend->io_size);
@@ -222,30 +248,23 @@ xfs_end_io(
 	 * We might have to update the on-disk file size after extending
 	 * writes.
 	 */
-	if (ioend->io_type != IOMAP_READ)
-		xfs_setfilesize(ioend);
-	xfs_destroy_ioend(ioend);
-}
-
-/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
-	xfs_ioend_t	*ioend,
-	int		wait)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining)) {
-		struct workqueue_struct *wq;
-
-		wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
-			xfsconvertd_workqueue : xfsdatad_workqueue;
-		queue_work(wq, &ioend->io_work);
-		if (wait)
-			flush_workqueue(wq);
+	if (ioend->io_type != IOMAP_READ) {
+		error = xfs_setfilesize(ioend);
+		ASSERT(!error || error == EAGAIN);
 	}
+
+	/*
+	 * If we didn't complete processing of the ioend, requeue it to the
+	 * tail of the workqueue for another attempt later. Otherwise destroy
+	 * it.
+	 */
+	if (error == EAGAIN) {
+		atomic_inc(&ioend->io_remaining);
+		xfs_finish_ioend(ioend, 0);
+		/* ensure we don't spin on blocked ioends */
+		delay(1);
+	} else
+		xfs_destroy_ioend(ioend);
 }
 
 /*