aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2017-11-21 23:53:02 -0500
committerDarrick J. Wong <darrick.wong@oracle.com>2017-11-27 12:34:08 -0500
commit509955823cc9cc225c05673b1b83d70ca70c5c60 (patch)
tree2f8bbd973140712bbb026d3af3afffe526b2a4dc
parent98c4f78dcdd8cec112d1cbc5e9a792ee6e5ab7a6 (diff)
xfs: log recovery should replay deferred ops in order
As part of testing log recovery with dm_log_writes, Amir Goldstein discovered an error in the deferred ops recovery that led to corruption of the filesystem metadata if a reflink+rmap filesystem happened to shut down midway through a CoW remap: "This is what happens [after failed log recovery]: "Phase 1 - find and verify superblock... "Phase 2 - using internal log " - zero log... " - scan filesystem freespace and inode maps... " - found root inode chunk "Phase 3 - for each AG... " - scan (but don't clear) agi unlinked lists... " - process known inodes and perform inode discovery... " - agno = 0 "data fork in regular inode 134 claims CoW block 376 "correcting nextents for inode 134 "bad data fork in inode 134 "would have cleared inode 134" Hou Tao dissected the log contents of exactly such a crash: "According to the implementation of xfs_defer_finish(), these ops should be completed in the following sequence: "Have been done: "(1) CUI: Oper (160) "(2) BUI: Oper (161) "(3) CUD: Oper (194), for CUI Oper (160) "(4) RUI A: Oper (197), free rmap [0x155, 2, -9] "Should be done: "(5) BUD: for BUI Oper (161) "(6) RUI B: add rmap [0x155, 2, 137] "(7) RUD: for RUI A "(8) RUD: for RUI B "Actually be done by xlog_recover_process_intents() "(5) BUD: for BUI Oper (161) "(6) RUI B: add rmap [0x155, 2, 137] "(7) RUD: for RUI B "(8) RUD: for RUI A "So the rmap entry [0x155, 2, -9] for COW should be freed firstly, then a new rmap entry [0x155, 2, 137] will be added. However, as we can see from the log record in post_mount.log (generated after umount) and the trace print, the new rmap entry [0x155, 2, 137] are added firstly, then the rmap entry [0x155, 2, -9] are freed." When reconstructing the internal log state from the log items found on disk, it's required that deferred ops replay in exactly the same order that they would have had the filesystem not gone down. However, replaying unfinished deferred ops can create /more/ deferred ops. 
These new deferred ops are finished in the wrong order. This causes fs corruption and replay crashes, so let's create a single defer_ops to handle the subsequent ops created during replay, then use one single transaction at the end of log recovery to ensure that everything is replayed in the same order as they're supposed to be. Reported-by: Amir Goldstein <amir73il@gmail.com> Analyzed-by: Hou Tao <houtao1@huawei.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Tested-by: Amir Goldstein <amir73il@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
-rw-r--r--fs/xfs/xfs_bmap_item.c23
-rw-r--r--fs/xfs/xfs_bmap_item.h3
-rw-r--r--fs/xfs/xfs_log_recover.c75
-rw-r--r--fs/xfs/xfs_refcount_item.c21
-rw-r--r--fs/xfs/xfs_refcount_item.h3
5 files changed, 85 insertions, 40 deletions
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index dd136f7275e4..e5fb008d75e8 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -389,7 +389,8 @@ xfs_bud_init(
389int 389int
390xfs_bui_recover( 390xfs_bui_recover(
391 struct xfs_mount *mp, 391 struct xfs_mount *mp,
392 struct xfs_bui_log_item *buip) 392 struct xfs_bui_log_item *buip,
393 struct xfs_defer_ops *dfops)
393{ 394{
394 int error = 0; 395 int error = 0;
395 unsigned int bui_type; 396 unsigned int bui_type;
@@ -404,9 +405,7 @@ xfs_bui_recover(
404 xfs_exntst_t state; 405 xfs_exntst_t state;
405 struct xfs_trans *tp; 406 struct xfs_trans *tp;
406 struct xfs_inode *ip = NULL; 407 struct xfs_inode *ip = NULL;
407 struct xfs_defer_ops dfops;
408 struct xfs_bmbt_irec irec; 408 struct xfs_bmbt_irec irec;
409 xfs_fsblock_t firstfsb;
410 409
411 ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); 410 ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
412 411
@@ -464,7 +463,6 @@ xfs_bui_recover(
464 463
465 if (VFS_I(ip)->i_nlink == 0) 464 if (VFS_I(ip)->i_nlink == 0)
466 xfs_iflags_set(ip, XFS_IRECOVERY); 465 xfs_iflags_set(ip, XFS_IRECOVERY);
467 xfs_defer_init(&dfops, &firstfsb);
468 466
469 /* Process deferred bmap item. */ 467 /* Process deferred bmap item. */
470 state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 468 state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
@@ -479,16 +477,16 @@ xfs_bui_recover(
479 break; 477 break;
480 default: 478 default:
481 error = -EFSCORRUPTED; 479 error = -EFSCORRUPTED;
482 goto err_dfops; 480 goto err_inode;
483 } 481 }
484 xfs_trans_ijoin(tp, ip, 0); 482 xfs_trans_ijoin(tp, ip, 0);
485 483
486 count = bmap->me_len; 484 count = bmap->me_len;
487 error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, 485 error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type,
488 ip, whichfork, bmap->me_startoff, 486 ip, whichfork, bmap->me_startoff,
489 bmap->me_startblock, &count, state); 487 bmap->me_startblock, &count, state);
490 if (error) 488 if (error)
491 goto err_dfops; 489 goto err_inode;
492 490
493 if (count > 0) { 491 if (count > 0) {
494 ASSERT(type == XFS_BMAP_UNMAP); 492 ASSERT(type == XFS_BMAP_UNMAP);
@@ -496,16 +494,11 @@ xfs_bui_recover(
496 irec.br_blockcount = count; 494 irec.br_blockcount = count;
497 irec.br_startoff = bmap->me_startoff; 495 irec.br_startoff = bmap->me_startoff;
498 irec.br_state = state; 496 irec.br_state = state;
499 error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); 497 error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec);
500 if (error) 498 if (error)
501 goto err_dfops; 499 goto err_inode;
502 } 500 }
503 501
504 /* Finish transaction, free inodes. */
505 error = xfs_defer_finish(&tp, &dfops);
506 if (error)
507 goto err_dfops;
508
509 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); 502 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
510 error = xfs_trans_commit(tp); 503 error = xfs_trans_commit(tp);
511 xfs_iunlock(ip, XFS_ILOCK_EXCL); 504 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -513,8 +506,6 @@ xfs_bui_recover(
513 506
514 return error; 507 return error;
515 508
516err_dfops:
517 xfs_defer_cancel(&dfops);
518err_inode: 509err_inode:
519 xfs_trans_cancel(tp); 510 xfs_trans_cancel(tp);
520 if (ip) { 511 if (ip) {
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index c867daae4a3c..24b354a2c836 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
93 struct xfs_bui_log_item *); 93 struct xfs_bui_log_item *);
94void xfs_bui_item_free(struct xfs_bui_log_item *); 94void xfs_bui_item_free(struct xfs_bui_log_item *);
95void xfs_bui_release(struct xfs_bui_log_item *); 95void xfs_bui_release(struct xfs_bui_log_item *);
96int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); 96int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip,
97 struct xfs_defer_ops *dfops);
97 98
98#endif /* __XFS_BMAP_ITEM_H__ */ 99#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 87b1c331f9eb..28d1abfe835e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,6 +24,7 @@
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_mount.h" 26#include "xfs_mount.h"
27#include "xfs_defer.h"
27#include "xfs_da_format.h" 28#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
29#include "xfs_inode.h" 30#include "xfs_inode.h"
@@ -4716,7 +4717,8 @@ STATIC int
4716xlog_recover_process_cui( 4717xlog_recover_process_cui(
4717 struct xfs_mount *mp, 4718 struct xfs_mount *mp,
4718 struct xfs_ail *ailp, 4719 struct xfs_ail *ailp,
4719 struct xfs_log_item *lip) 4720 struct xfs_log_item *lip,
4721 struct xfs_defer_ops *dfops)
4720{ 4722{
4721 struct xfs_cui_log_item *cuip; 4723 struct xfs_cui_log_item *cuip;
4722 int error; 4724 int error;
@@ -4729,7 +4731,7 @@ xlog_recover_process_cui(
4729 return 0; 4731 return 0;
4730 4732
4731 spin_unlock(&ailp->xa_lock); 4733 spin_unlock(&ailp->xa_lock);
4732 error = xfs_cui_recover(mp, cuip); 4734 error = xfs_cui_recover(mp, cuip, dfops);
4733 spin_lock(&ailp->xa_lock); 4735 spin_lock(&ailp->xa_lock);
4734 4736
4735 return error; 4737 return error;
@@ -4756,7 +4758,8 @@ STATIC int
4756xlog_recover_process_bui( 4758xlog_recover_process_bui(
4757 struct xfs_mount *mp, 4759 struct xfs_mount *mp,
4758 struct xfs_ail *ailp, 4760 struct xfs_ail *ailp,
4759 struct xfs_log_item *lip) 4761 struct xfs_log_item *lip,
4762 struct xfs_defer_ops *dfops)
4760{ 4763{
4761 struct xfs_bui_log_item *buip; 4764 struct xfs_bui_log_item *buip;
4762 int error; 4765 int error;
@@ -4769,7 +4772,7 @@ xlog_recover_process_bui(
4769 return 0; 4772 return 0;
4770 4773
4771 spin_unlock(&ailp->xa_lock); 4774 spin_unlock(&ailp->xa_lock);
4772 error = xfs_bui_recover(mp, buip); 4775 error = xfs_bui_recover(mp, buip, dfops);
4773 spin_lock(&ailp->xa_lock); 4776 spin_lock(&ailp->xa_lock);
4774 4777
4775 return error; 4778 return error;
@@ -4805,6 +4808,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
4805 } 4808 }
4806} 4809}
4807 4810
4811/* Take all the collected deferred ops and finish them in order. */
4812static int
4813xlog_finish_defer_ops(
4814 struct xfs_mount *mp,
4815 struct xfs_defer_ops *dfops)
4816{
4817 struct xfs_trans *tp;
4818 int64_t freeblks;
4819 uint resblks;
4820 int error;
4821
4822 /*
4823 * We're finishing the defer_ops that accumulated as a result of
4824 * recovering unfinished intent items during log recovery. We
4825 * reserve an itruncate transaction because it is the largest
4826 * permanent transaction type. Since we're the only user of the fs
4827 * right now, take 93% (15/16) of the available free blocks. Use
4828 * weird math to avoid a 64-bit division.
4829 */
4830 freeblks = percpu_counter_sum(&mp->m_fdblocks);
4831 if (freeblks <= 0)
4832 return -ENOSPC;
4833 resblks = min_t(int64_t, UINT_MAX, freeblks);
4834 resblks = (resblks * 15) >> 4;
4835 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
4836 0, XFS_TRANS_RESERVE, &tp);
4837 if (error)
4838 return error;
4839
4840 error = xfs_defer_finish(&tp, dfops);
4841 if (error)
4842 goto out_cancel;
4843
4844 return xfs_trans_commit(tp);
4845
4846out_cancel:
4847 xfs_trans_cancel(tp);
4848 return error;
4849}
4850
4808/* 4851/*
4809 * When this is called, all of the log intent items which did not have 4852 * When this is called, all of the log intent items which did not have
4810 * corresponding log done items should be in the AIL. What we do now 4853 * corresponding log done items should be in the AIL. What we do now
@@ -4825,10 +4868,12 @@ STATIC int
4825xlog_recover_process_intents( 4868xlog_recover_process_intents(
4826 struct xlog *log) 4869 struct xlog *log)
4827{ 4870{
4828 struct xfs_log_item *lip; 4871 struct xfs_defer_ops dfops;
4829 int error = 0;
4830 struct xfs_ail_cursor cur; 4872 struct xfs_ail_cursor cur;
4873 struct xfs_log_item *lip;
4831 struct xfs_ail *ailp; 4874 struct xfs_ail *ailp;
4875 xfs_fsblock_t firstfsb;
4876 int error = 0;
4832#if defined(DEBUG) || defined(XFS_WARN) 4877#if defined(DEBUG) || defined(XFS_WARN)
4833 xfs_lsn_t last_lsn; 4878 xfs_lsn_t last_lsn;
4834#endif 4879#endif
@@ -4839,6 +4884,7 @@ xlog_recover_process_intents(
4839#if defined(DEBUG) || defined(XFS_WARN) 4884#if defined(DEBUG) || defined(XFS_WARN)
4840 last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); 4885 last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
4841#endif 4886#endif
4887 xfs_defer_init(&dfops, &firstfsb);
4842 while (lip != NULL) { 4888 while (lip != NULL) {
4843 /* 4889 /*
4844 * We're done when we see something other than an intent. 4890 * We're done when we see something other than an intent.
@@ -4859,6 +4905,12 @@ xlog_recover_process_intents(
4859 */ 4905 */
4860 ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); 4906 ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
4861 4907
4908 /*
4909 * NOTE: If your intent processing routine can create more
4910 * deferred ops, you /must/ attach them to the dfops in this
4911 * routine or else those subsequent intents will get
4912 * replayed in the wrong order!
4913 */
4862 switch (lip->li_type) { 4914 switch (lip->li_type) {
4863 case XFS_LI_EFI: 4915 case XFS_LI_EFI:
4864 error = xlog_recover_process_efi(log->l_mp, ailp, lip); 4916 error = xlog_recover_process_efi(log->l_mp, ailp, lip);
@@ -4867,10 +4919,12 @@ xlog_recover_process_intents(
4867 error = xlog_recover_process_rui(log->l_mp, ailp, lip); 4919 error = xlog_recover_process_rui(log->l_mp, ailp, lip);
4868 break; 4920 break;
4869 case XFS_LI_CUI: 4921 case XFS_LI_CUI:
4870 error = xlog_recover_process_cui(log->l_mp, ailp, lip); 4922 error = xlog_recover_process_cui(log->l_mp, ailp, lip,
4923 &dfops);
4871 break; 4924 break;
4872 case XFS_LI_BUI: 4925 case XFS_LI_BUI:
4873 error = xlog_recover_process_bui(log->l_mp, ailp, lip); 4926 error = xlog_recover_process_bui(log->l_mp, ailp, lip,
4927 &dfops);
4874 break; 4928 break;
4875 } 4929 }
4876 if (error) 4930 if (error)
@@ -4880,6 +4934,11 @@ xlog_recover_process_intents(
4880out: 4934out:
4881 xfs_trans_ail_cursor_done(&cur); 4935 xfs_trans_ail_cursor_done(&cur);
4882 spin_unlock(&ailp->xa_lock); 4936 spin_unlock(&ailp->xa_lock);
4937 if (error)
4938 xfs_defer_cancel(&dfops);
4939 else
4940 error = xlog_finish_defer_ops(log->l_mp, &dfops);
4941
4883 return error; 4942 return error;
4884} 4943}
4885 4944
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 8f2e2fac4255..3a55d6fc271b 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -393,7 +393,8 @@ xfs_cud_init(
393int 393int
394xfs_cui_recover( 394xfs_cui_recover(
395 struct xfs_mount *mp, 395 struct xfs_mount *mp,
396 struct xfs_cui_log_item *cuip) 396 struct xfs_cui_log_item *cuip,
397 struct xfs_defer_ops *dfops)
397{ 398{
398 int i; 399 int i;
399 int error = 0; 400 int error = 0;
@@ -405,11 +406,9 @@ xfs_cui_recover(
405 struct xfs_trans *tp; 406 struct xfs_trans *tp;
406 struct xfs_btree_cur *rcur = NULL; 407 struct xfs_btree_cur *rcur = NULL;
407 enum xfs_refcount_intent_type type; 408 enum xfs_refcount_intent_type type;
408 xfs_fsblock_t firstfsb;
409 xfs_fsblock_t new_fsb; 409 xfs_fsblock_t new_fsb;
410 xfs_extlen_t new_len; 410 xfs_extlen_t new_len;
411 struct xfs_bmbt_irec irec; 411 struct xfs_bmbt_irec irec;
412 struct xfs_defer_ops dfops;
413 bool requeue_only = false; 412 bool requeue_only = false;
414 413
415 ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); 414 ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
@@ -465,7 +464,6 @@ xfs_cui_recover(
465 return error; 464 return error;
466 cudp = xfs_trans_get_cud(tp, cuip); 465 cudp = xfs_trans_get_cud(tp, cuip);
467 466
468 xfs_defer_init(&dfops, &firstfsb);
469 for (i = 0; i < cuip->cui_format.cui_nextents; i++) { 467 for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
470 refc = &cuip->cui_format.cui_extents[i]; 468 refc = &cuip->cui_format.cui_extents[i];
471 refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; 469 refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
@@ -485,7 +483,7 @@ xfs_cui_recover(
485 new_len = refc->pe_len; 483 new_len = refc->pe_len;
486 } else 484 } else
487 error = xfs_trans_log_finish_refcount_update(tp, cudp, 485 error = xfs_trans_log_finish_refcount_update(tp, cudp,
488 &dfops, type, refc->pe_startblock, refc->pe_len, 486 dfops, type, refc->pe_startblock, refc->pe_len,
489 &new_fsb, &new_len, &rcur); 487 &new_fsb, &new_len, &rcur);
490 if (error) 488 if (error)
491 goto abort_error; 489 goto abort_error;
@@ -497,21 +495,21 @@ xfs_cui_recover(
497 switch (type) { 495 switch (type) {
498 case XFS_REFCOUNT_INCREASE: 496 case XFS_REFCOUNT_INCREASE:
499 error = xfs_refcount_increase_extent( 497 error = xfs_refcount_increase_extent(
500 tp->t_mountp, &dfops, &irec); 498 tp->t_mountp, dfops, &irec);
501 break; 499 break;
502 case XFS_REFCOUNT_DECREASE: 500 case XFS_REFCOUNT_DECREASE:
503 error = xfs_refcount_decrease_extent( 501 error = xfs_refcount_decrease_extent(
504 tp->t_mountp, &dfops, &irec); 502 tp->t_mountp, dfops, &irec);
505 break; 503 break;
506 case XFS_REFCOUNT_ALLOC_COW: 504 case XFS_REFCOUNT_ALLOC_COW:
507 error = xfs_refcount_alloc_cow_extent( 505 error = xfs_refcount_alloc_cow_extent(
508 tp->t_mountp, &dfops, 506 tp->t_mountp, dfops,
509 irec.br_startblock, 507 irec.br_startblock,
510 irec.br_blockcount); 508 irec.br_blockcount);
511 break; 509 break;
512 case XFS_REFCOUNT_FREE_COW: 510 case XFS_REFCOUNT_FREE_COW:
513 error = xfs_refcount_free_cow_extent( 511 error = xfs_refcount_free_cow_extent(
514 tp->t_mountp, &dfops, 512 tp->t_mountp, dfops,
515 irec.br_startblock, 513 irec.br_startblock,
516 irec.br_blockcount); 514 irec.br_blockcount);
517 break; 515 break;
@@ -525,17 +523,12 @@ xfs_cui_recover(
525 } 523 }
526 524
527 xfs_refcount_finish_one_cleanup(tp, rcur, error); 525 xfs_refcount_finish_one_cleanup(tp, rcur, error);
528 error = xfs_defer_finish(&tp, &dfops);
529 if (error)
530 goto abort_defer;
531 set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); 526 set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
532 error = xfs_trans_commit(tp); 527 error = xfs_trans_commit(tp);
533 return error; 528 return error;
534 529
535abort_error: 530abort_error:
536 xfs_refcount_finish_one_cleanup(tp, rcur, error); 531 xfs_refcount_finish_one_cleanup(tp, rcur, error);
537abort_defer:
538 xfs_defer_cancel(&dfops);
539 xfs_trans_cancel(tp); 532 xfs_trans_cancel(tp);
540 return error; 533 return error;
541} 534}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index 5b74dddfa64b..0e5327349a13 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
96 struct xfs_cui_log_item *); 96 struct xfs_cui_log_item *);
97void xfs_cui_item_free(struct xfs_cui_log_item *); 97void xfs_cui_item_free(struct xfs_cui_log_item *);
98void xfs_cui_release(struct xfs_cui_log_item *); 98void xfs_cui_release(struct xfs_cui_log_item *);
99int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); 99int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip,
100 struct xfs_defer_ops *dfops);
100 101
101#endif /* __XFS_REFCOUNT_ITEM_H__ */ 102#endif /* __XFS_REFCOUNT_ITEM_H__ */