aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2014-04-16 18:15:26 -0400
committerDave Chinner <david@fromorbit.com>2014-04-16 18:15:26 -0400
commit9c23eccc1e746f64b18fab070a37189b4422e44a (patch)
treef840c5e0e6bbaed4797ae819e1acca137a2c1d64 /fs
parentd39a2ced0fa0172faa46df0866fc22419b876e2a (diff)
xfs: unmount does not wait for shutdown during unmount
And interesting situation can occur if a log IO error occurs during the unmount of a filesystem. The cases reported have the same signature - the update of the superblock counters fails due to a log write IO error: XFS (dm-16): xfs_do_force_shutdown(0x2) called from line 1170 of file fs/xfs/xfs_log.c. Return address = 0xffffffffa08a44a1 XFS (dm-16): Log I/O Error Detected. Shutting down filesystem XFS (dm-16): Unable to update superblock counters. Freespace may not be correct on next mount. XFS (dm-16): xfs_log_force: error 5 returned. XFS (¿-¿¿¿): Please umount the filesystem and rectify the problem(s) It can be seen that the last line of output contains a corrupt device name - this is because the log and xfs_mount structures have already been freed by the time this message is printed. A kernel oops closely follows. The issue is that the shutdown is occurring in a separate IO completion thread to the unmount. Once the shutdown processing has started and all the iclogs are marked with XLOG_STATE_IOERROR, the log shutdown code wakes anyone waiting on a log force so they can process the shutdown error. This wakes up the unmount code that is doing a synchronous transaction to update the superblock counters. The unmount path now sees all the iclogs are marked with XLOG_STATE_IOERROR and so never waits on them again, knowing that if it does, there will not be a wakeup trigger for it and we will hang the unmount if we do. Hence the unmount runs through all the remaining code and frees all the filesystem structures while the xlog_iodone() is still processing the shutdown. When the log shutdown processing completes, xfs_do_force_shutdown() emits the "Please umount the filesystem and rectify the problem(s)" message, and xlog_iodone() then aborts all the objects attached to the iclog. An iclog that has already been freed.... The real issue here is that there is no serialisation point between the log IO and the unmount. We have serialisations points for log writes, log forces, reservations, etc, but we don't actually have any code that wakes for log IO to fully complete. We do that for all other types of object, so why not iclogbufs? Well, it turns out that we can easily do this. We've got xfs_buf handles, and that's what everyone else uses for IO serialisation. i.e. bp->b_sema. So, lets hold iclogbufs locked over IO, and only release the lock in xlog_iodone() when we are finished with the buffer. That way before we tear down the iclog, we can lock and unlock the buffer to ensure IO completion has finished completely before we tear it down. Signed-off-by: Dave Chinner <dchinner@redhat.com> Tested-by: Mike Snitzer <snitzer@redhat.com> Tested-by: Bob Mastors <bob.mastors@solidfire.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/xfs_log.c53
1 files changed, 44 insertions, 9 deletions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 8497a00e399d..08624dc67317 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp)
1181 /* log I/O is always issued ASYNC */ 1181 /* log I/O is always issued ASYNC */
1182 ASSERT(XFS_BUF_ISASYNC(bp)); 1182 ASSERT(XFS_BUF_ISASYNC(bp));
1183 xlog_state_done_syncing(iclog, aborted); 1183 xlog_state_done_syncing(iclog, aborted);
1184
1184 /* 1185 /*
1185 * do not reference the buffer (bp) here as we could race 1186 * drop the buffer lock now that we are done. Nothing references
1186 * with it being freed after writing the unmount record to the 1187 * the buffer after this, so an unmount waiting on this lock can now
1187 * log. 1188 * tear it down safely. As such, it is unsafe to reference the buffer
1189 * (bp) after the unlock as we could race with it being freed.
1188 */ 1190 */
1191 xfs_buf_unlock(bp);
1189} 1192}
1190 1193
1191/* 1194/*
@@ -1368,8 +1371,16 @@ xlog_alloc_log(
1368 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); 1371 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
1369 if (!bp) 1372 if (!bp)
1370 goto out_free_log; 1373 goto out_free_log;
1371 bp->b_iodone = xlog_iodone; 1374
1375 /*
1376 * The iclogbuf buffer locks are held over IO but we are not going to do
1377 * IO yet. Hence unlock the buffer so that the log IO path can grab it
1378 * when appropriately.
1379 */
1372 ASSERT(xfs_buf_islocked(bp)); 1380 ASSERT(xfs_buf_islocked(bp));
1381 xfs_buf_unlock(bp);
1382
1383 bp->b_iodone = xlog_iodone;
1373 log->l_xbuf = bp; 1384 log->l_xbuf = bp;
1374 1385
1375 spin_lock_init(&log->l_icloglock); 1386 spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1409,9 @@ xlog_alloc_log(
1398 if (!bp) 1409 if (!bp)
1399 goto out_free_iclog; 1410 goto out_free_iclog;
1400 1411
1412 ASSERT(xfs_buf_islocked(bp));
1413 xfs_buf_unlock(bp);
1414
1401 bp->b_iodone = xlog_iodone; 1415 bp->b_iodone = xlog_iodone;
1402 iclog->ic_bp = bp; 1416 iclog->ic_bp = bp;
1403 iclog->ic_data = bp->b_addr; 1417 iclog->ic_data = bp->b_addr;
@@ -1422,7 +1436,6 @@ xlog_alloc_log(
1422 iclog->ic_callback_tail = &(iclog->ic_callback); 1436 iclog->ic_callback_tail = &(iclog->ic_callback);
1423 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; 1437 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1424 1438
1425 ASSERT(xfs_buf_islocked(iclog->ic_bp));
1426 init_waitqueue_head(&iclog->ic_force_wait); 1439 init_waitqueue_head(&iclog->ic_force_wait);
1427 init_waitqueue_head(&iclog->ic_write_wait); 1440 init_waitqueue_head(&iclog->ic_write_wait);
1428 1441
@@ -1631,6 +1644,12 @@ xlog_cksum(
1631 * we transition the iclogs to IOERROR state *after* flushing all existing 1644 * we transition the iclogs to IOERROR state *after* flushing all existing
1632 * iclogs to disk. This is because we don't want anymore new transactions to be 1645 * iclogs to disk. This is because we don't want anymore new transactions to be
1633 * started or completed afterwards. 1646 * started or completed afterwards.
1647 *
1648 * We lock the iclogbufs here so that we can serialise against IO completion
1649 * during unmount. We might be processing a shutdown triggered during unmount,
1650 * and that can occur asynchronously to the unmount thread, and hence we need to
1651 * ensure that completes before tearing down the iclogbufs. Hence we need to
1652 * hold the buffer lock across the log IO to acheive that.
1634 */ 1653 */
1635STATIC int 1654STATIC int
1636xlog_bdstrat( 1655xlog_bdstrat(
@@ -1638,6 +1657,7 @@ xlog_bdstrat(
1638{ 1657{
1639 struct xlog_in_core *iclog = bp->b_fspriv; 1658 struct xlog_in_core *iclog = bp->b_fspriv;
1640 1659
1660 xfs_buf_lock(bp);
1641 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1661 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1642 xfs_buf_ioerror(bp, EIO); 1662 xfs_buf_ioerror(bp, EIO);
1643 xfs_buf_stale(bp); 1663 xfs_buf_stale(bp);
@@ -1645,7 +1665,8 @@ xlog_bdstrat(
1645 /* 1665 /*
1646 * It would seem logical to return EIO here, but we rely on 1666 * It would seem logical to return EIO here, but we rely on
1647 * the log state machine to propagate I/O errors instead of 1667 * the log state machine to propagate I/O errors instead of
1648 * doing it here. 1668 * doing it here. Similarly, IO completion will unlock the
1669 * buffer, so we don't do it here.
1649 */ 1670 */
1650 return 0; 1671 return 0;
1651 } 1672 }
@@ -1847,14 +1868,28 @@ xlog_dealloc_log(
1847 xlog_cil_destroy(log); 1868 xlog_cil_destroy(log);
1848 1869
1849 /* 1870 /*
1850 * always need to ensure that the extra buffer does not point to memory 1871 * Cycle all the iclogbuf locks to make sure all log IO completion
1851 * owned by another log buffer before we free it. 1872 * is done before we tear down these buffers.
1852 */ 1873 */
1874 iclog = log->l_iclog;
1875 for (i = 0; i < log->l_iclog_bufs; i++) {
1876 xfs_buf_lock(iclog->ic_bp);
1877 xfs_buf_unlock(iclog->ic_bp);
1878 iclog = iclog->ic_next;
1879 }
1880
1881 /*
1882 * Always need to ensure that the extra buffer does not point to memory
1883 * owned by another log buffer before we free it. Also, cycle the lock
1884 * first to ensure we've completed IO on it.
1885 */
1886 xfs_buf_lock(log->l_xbuf);
1887 xfs_buf_unlock(log->l_xbuf);
1853 xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); 1888 xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
1854 xfs_buf_free(log->l_xbuf); 1889 xfs_buf_free(log->l_xbuf);
1855 1890
1856 iclog = log->l_iclog; 1891 iclog = log->l_iclog;
1857 for (i=0; i<log->l_iclog_bufs; i++) { 1892 for (i = 0; i < log->l_iclog_bufs; i++) {
1858 xfs_buf_free(iclog->ic_bp); 1893 xfs_buf_free(iclog->ic_bp);
1859 next_iclog = iclog->ic_next; 1894 next_iclog = iclog->ic_next;
1860 kmem_free(iclog); 1895 kmem_free(iclog);