aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_aops.c
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2012-11-27 21:01:00 -0500
committerBen Myers <bpm@sgi.com>2012-11-29 15:22:56 -0500
commit437a255aa23766666aec78af63be4c253faa8d57 (patch)
treefe35390067b0b0f3efc47420c454d6d6de7892c6 /fs/xfs/xfs_aops.c
parentef9d873344ff9f5084eacb9f3735982314dfda9e (diff)
xfs: fix direct IO nested transaction deadlock.
The direct IO path can do a nested transaction reservation when writing past the EOF. The first transaction is the append transaction for setting the filesize at IO completion, but we can also need a transaction for allocation of blocks. If the log is low on space due to reservations and small log, the append transaction can be granted after wating for space as the only active transaction in the system. This then attempts a reservation for an allocation, which there isn't space in the log for, and the reservation sleeps. The result is that there is nothing left in the system to wake up all the processes waiting for log space to come free. The stack trace that shows this deadlock is relatively innocuous: xlog_grant_head_wait xlog_grant_head_check xfs_log_reserve xfs_trans_reserve xfs_iomap_write_direct __xfs_get_blocks xfs_get_blocks_direct do_blockdev_direct_IO __blockdev_direct_IO xfs_vm_direct_IO generic_file_direct_write xfs_file_dio_aio_writ xfs_file_aio_write do_sync_write vfs_write This was discovered on a filesystem with a log of only 10MB, and a log stripe unit of 256k whih increased the base reservations by 512k. Hence a allocation transaction requires 1.2MB of log space to be available instead of only 260k, and so greatly increased the chance that there wouldn't be enough log space available for the nested transaction to succeed. The key to reproducing it is this mkfs command: mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV The test case was a 1000 fsstress processes running with random freeze and unfreezes every few seconds. Thanks to Eryu Guan (eguan@redhat.com) for writing the test that found this on a system with a somewhat unique default configuration.... cc: <stable@vger.kernel.org> Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Andrew Dahl <adahl@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--fs/xfs/xfs_aops.c81
1 files changed, 29 insertions, 52 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 71361da1f77c..4111a40ebe1a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
124 ioend->io_append_trans = tp; 124 ioend->io_append_trans = tp;
125 125
126 /* 126 /*
127 * We will pass freeze protection with a transaction. So tell lockdep 127 * We may pass freeze protection with a transaction. So tell lockdep
128 * we released it. 128 * we released it.
129 */ 129 */
130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
149 xfs_fsize_t isize; 149 xfs_fsize_t isize;
150 150
151 /* 151 /*
152 * The transaction was allocated in the I/O submission thread, 152 * The transaction may have been allocated in the I/O submission thread,
153 * thus we need to mark ourselves as beeing in a transaction 153 * thus we need to mark ourselves as beeing in a transaction manually.
154 * manually. 154 * Similarly for freeze protection.
155 */ 155 */
156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
157 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
158 0, 1, _THIS_IP_);
157 159
158 xfs_ilock(ip, XFS_ILOCK_EXCL); 160 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); 161 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
187 189
188 if (ioend->io_type == XFS_IO_UNWRITTEN) 190 if (ioend->io_type == XFS_IO_UNWRITTEN)
189 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 191 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
190 else if (ioend->io_append_trans) 192 else if (ioend->io_append_trans ||
193 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
191 queue_work(mp->m_data_workqueue, &ioend->io_work); 194 queue_work(mp->m_data_workqueue, &ioend->io_work);
192 else 195 else
193 xfs_destroy_ioend(ioend); 196 xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 208 struct xfs_inode *ip = XFS_I(ioend->io_inode);
206 int error = 0; 209 int error = 0;
207 210
208 if (ioend->io_append_trans) {
209 /*
210 * We've got freeze protection passed with the transaction.
211 * Tell lockdep about it.
212 */
213 rwsem_acquire_read(
214 &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
215 0, 1, _THIS_IP_);
216 }
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
218 ioend->io_error = -EIO; 212 ioend->io_error = -EIO;
219 goto done; 213 goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
226 * range to normal written extens after the data I/O has finished. 220 * range to normal written extens after the data I/O has finished.
227 */ 221 */
228 if (ioend->io_type == XFS_IO_UNWRITTEN) { 222 if (ioend->io_type == XFS_IO_UNWRITTEN) {
223 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
224 ioend->io_size);
225 } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
229 /* 226 /*
230 * For buffered I/O we never preallocate a transaction when 227 * For direct I/O we do not know if we need to allocate blocks
231 * doing the unwritten extent conversion, but for direct I/O 228 * or not so we can't preallocate an append transaction as that
232 * we do not know if we are converting an unwritten extent 229 * results in nested reservations and log space deadlocks. Hence
233 * or not at the point where we preallocate the transaction. 230 * allocate the transaction here. While this is sub-optimal and
231 * can block IO completion for some time, we're stuck with doing
232 * it this way until we can pass the ioend to the direct IO
233 * allocation callbacks and avoid nesting that way.
234 */ 234 */
235 if (ioend->io_append_trans) { 235 error = xfs_setfilesize_trans_alloc(ioend);
236 ASSERT(ioend->io_isdirect); 236 if (error)
237
238 current_set_flags_nested(
239 &ioend->io_append_trans->t_pflags, PF_FSTRANS);
240 xfs_trans_cancel(ioend->io_append_trans, 0);
241 }
242
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244 ioend->io_size);
245 if (error) {
246 ioend->io_error = -error;
247 goto done; 237 goto done;
248 } 238 error = xfs_setfilesize(ioend);
249 } else if (ioend->io_append_trans) { 239 } else if (ioend->io_append_trans) {
250 error = xfs_setfilesize(ioend); 240 error = xfs_setfilesize(ioend);
251 if (error)
252 ioend->io_error = -error;
253 } else { 241 } else {
254 ASSERT(!xfs_ioend_is_append(ioend)); 242 ASSERT(!xfs_ioend_is_append(ioend));
255 } 243 }
256 244
257done: 245done:
246 if (error)
247 ioend->io_error = -error;
258 xfs_destroy_ioend(ioend); 248 xfs_destroy_ioend(ioend);
259} 249}
260 250
@@ -1432,25 +1422,21 @@ xfs_vm_direct_IO(
1432 size_t size = iov_length(iov, nr_segs); 1422 size_t size = iov_length(iov, nr_segs);
1433 1423
1434 /* 1424 /*
1435 * We need to preallocate a transaction for a size update 1425 * We cannot preallocate a size update transaction here as we
1436 * here. In the case that this write both updates the size 1426 * don't know whether allocation is necessary or not. Hence we
1437 * and converts at least on unwritten extent we will cancel 1427 * can only tell IO completion that one is necessary if we are
1438 * the still clean transaction after the I/O has finished. 1428 * not doing unwritten extent conversion.
1439 */ 1429 */
1440 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); 1430 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1441 if (offset + size > XFS_I(inode)->i_d.di_size) { 1431 if (offset + size > XFS_I(inode)->i_d.di_size)
1442 ret = xfs_setfilesize_trans_alloc(ioend);
1443 if (ret)
1444 goto out_destroy_ioend;
1445 ioend->io_isdirect = 1; 1432 ioend->io_isdirect = 1;
1446 }
1447 1433
1448 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1434 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1449 offset, nr_segs, 1435 offset, nr_segs,
1450 xfs_get_blocks_direct, 1436 xfs_get_blocks_direct,
1451 xfs_end_io_direct_write, NULL, 0); 1437 xfs_end_io_direct_write, NULL, 0);
1452 if (ret != -EIOCBQUEUED && iocb->private) 1438 if (ret != -EIOCBQUEUED && iocb->private)
1453 goto out_trans_cancel; 1439 goto out_destroy_ioend;
1454 } else { 1440 } else {
1455 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1441 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1456 offset, nr_segs, 1442 offset, nr_segs,
@@ -1460,15 +1446,6 @@ xfs_vm_direct_IO(
1460 1446
1461 return ret; 1447 return ret;
1462 1448
1463out_trans_cancel:
1464 if (ioend->io_append_trans) {
1465 current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1466 PF_FSTRANS);
1467 rwsem_acquire_read(
1468 &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
1469 0, 1, _THIS_IP_);
1470 xfs_trans_cancel(ioend->io_append_trans, 0);
1471 }
1472out_destroy_ioend: 1449out_destroy_ioend:
1473 xfs_destroy_ioend(ioend); 1450 xfs_destroy_ioend(ioend);
1474 return ret; 1451 return ret;