aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-06-04 22:09:07 -0400
committerBen Myers <bpm@sgi.com>2013-06-06 11:51:07 -0400
commit75406170751b4de88a01f73dda56efa617ddd5d7 (patch)
treeb9eac9caa2d9dc318e0ac71cd3bf8f9556db0d5f /fs
parentea929536a43226a01d1a73ac8b14d52e81163bd4 (diff)
xfs: fix log recovery transaction item reordering
There are several constraints that inode allocation and unlink logging impose on log recovery. These all stem from the fact that inode alloc/unlink are logged in buffers, but all other inode changes are logged in inode items. Hence there are ordering constraints that recovery must follow to ensure the correct result occurs. As it turns out, this ordering has been working more by chance than by good management. The existing code moves all buffers except cancelled buffers to the head of the list, and everything else to the tail of the list. The problem with this is that it interleaves inode items with the buffer cancellation items, and hence whether the inode item in a cancelled buffer gets replayed is essentially left to chance. Further, this ordering causes problems for log recovery when inode CRCs are enabled. It typically replays the inode unlink buffer long before it replays the inode core changes, and so the CRC recorded in an unlink buffer is going to be invalid and hence any attempt to validate the inode in the buffer is going to fail. Hence we really need to enforce the ordering that the inode alloc/unlink code has expected log recovery to have since inode chunk de-allocation was introduced back in 2003... Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com> (cherry picked from commit a775ad778073d55744ed6709ccede36310638911)
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/xfs_log_recover.c65
1 files changed, 58 insertions, 7 deletions
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index d6204d1ac47f..83088d96e6c4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans(
1599} 1599}
1600 1600
1601/* 1601/*
1602 * Sort the log items in the transaction. Cancelled buffers need 1602 * Sort the log items in the transaction.
1603 * to be put first so they are processed before any items that might 1603 *
1604 * modify the buffers. If they are cancelled, then the modifications 1604 * The ordering constraints are defined by the inode allocation and unlink
1605 * don't need to be replayed. 1605 * behaviour. The rules are:
1606 *
1607 * 1. Every item is only logged once in a given transaction. Hence it
1608 * represents the last logged state of the item. Hence ordering is
1609 * dependent on the order in which operations need to be performed so
1610 * required initial conditions are always met.
1611 *
1612 * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1613 * there's nothing to replay from them so we can simply cull them
1614 * from the transaction. However, we can't do that until after we've
1615 * replayed all the other items because they may be dependent on the
1616 * cancelled buffer and replaying the cancelled buffer can remove it
1617 * from the cancelled buffer table. Hence they have to be done last.
1618 *
1619 * 3. Inode allocation buffers must be replayed before inode items that
1620 * read the buffer and replay changes into it.
1621 *
1622 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1623 * This ensures that inodes are completely flushed to the inode buffer
1624 * in a "free" state before we remove the unlinked inode list pointer.
1625 *
1626 * Hence the ordering needs to be inode allocation buffers first, inode items
1627 * second, inode unlink buffers third and cancelled buffers last.
1628 *
1629 * But there's a problem with that - we can't tell an inode allocation buffer
1630 * apart from a regular buffer, so we can't separate them. We can, however,
1631 * tell an inode unlink buffer from the others, and so we can separate them out
1632 * from all the other buffers and move them to last.
1633 *
1634 * Hence, 4 lists, in order from head to tail:
1635 * - buffer_list for all buffers except cancelled/inode unlink buffers
1636 * - inode_list for all non-buffer items
1637 * - inode_buffer_list for inode unlink buffers
1638 * - cancel_list for the cancelled buffers
1606 */ 1639 */
1607STATIC int 1640STATIC int
1608xlog_recover_reorder_trans( 1641xlog_recover_reorder_trans(
@@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans(
1612{ 1645{
1613 xlog_recover_item_t *item, *n; 1646 xlog_recover_item_t *item, *n;
1614 LIST_HEAD(sort_list); 1647 LIST_HEAD(sort_list);
1648 LIST_HEAD(cancel_list);
1649 LIST_HEAD(buffer_list);
1650 LIST_HEAD(inode_buffer_list);
1651 LIST_HEAD(inode_list);
1615 1652
1616 list_splice_init(&trans->r_itemq, &sort_list); 1653 list_splice_init(&trans->r_itemq, &sort_list);
1617 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1654 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
@@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans(
1619 1656
1620 switch (ITEM_TYPE(item)) { 1657 switch (ITEM_TYPE(item)) {
1621 case XFS_LI_BUF: 1658 case XFS_LI_BUF:
1622 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { 1659 if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1623 trace_xfs_log_recover_item_reorder_head(log, 1660 trace_xfs_log_recover_item_reorder_head(log,
1624 trans, item, pass); 1661 trans, item, pass);
1625 list_move(&item->ri_list, &trans->r_itemq); 1662 list_move(&item->ri_list, &cancel_list);
1626 break; 1663 break;
1627 } 1664 }
1665 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1666 list_move(&item->ri_list, &inode_buffer_list);
1667 break;
1668 }
1669 list_move_tail(&item->ri_list, &buffer_list);
1670 break;
1628 case XFS_LI_INODE: 1671 case XFS_LI_INODE:
1629 case XFS_LI_DQUOT: 1672 case XFS_LI_DQUOT:
1630 case XFS_LI_QUOTAOFF: 1673 case XFS_LI_QUOTAOFF:
@@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans(
1632 case XFS_LI_EFI: 1675 case XFS_LI_EFI:
1633 trace_xfs_log_recover_item_reorder_tail(log, 1676 trace_xfs_log_recover_item_reorder_tail(log,
1634 trans, item, pass); 1677 trans, item, pass);
1635 list_move_tail(&item->ri_list, &trans->r_itemq); 1678 list_move_tail(&item->ri_list, &inode_list);
1636 break; 1679 break;
1637 default: 1680 default:
1638 xfs_warn(log->l_mp, 1681 xfs_warn(log->l_mp,
@@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans(
1643 } 1686 }
1644 } 1687 }
1645 ASSERT(list_empty(&sort_list)); 1688 ASSERT(list_empty(&sort_list));
1689 if (!list_empty(&buffer_list))
1690 list_splice(&buffer_list, &trans->r_itemq);
1691 if (!list_empty(&inode_list))
1692 list_splice_tail(&inode_list, &trans->r_itemq);
1693 if (!list_empty(&inode_buffer_list))
1694 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1695 if (!list_empty(&cancel_list))
1696 list_splice_tail(&cancel_list, &trans->r_itemq);
1646 return 0; 1697 return 0;
1647} 1698}
1648 1699