diff options
author | Dave Chinner <dchinner@redhat.com> | 2010-05-20 09:19:42 -0400 |
---|---|---|
committer | Alex Elder <aelder@sgi.com> | 2010-05-24 11:41:22 -0400 |
commit | ccf7c23fc129e75ef60e6f59f60a485b7a056598 (patch) | |
tree | 957539e31ee2a7155bbf9bb085ec1cb1d3432d3a /fs | |
parent | df806158b0f6eb24247773b4a19b8b59d7217e59 (diff) |
xfs: Ensure inode allocation buffers are fully replayed
With delayed logging, we can get inode allocation buffers in the
same transaction inode unlink buffers. We don't currently mark inode
allocation buffers in the log, so inode unlink buffers take
precedence over allocation buffers.
The result is that when they are combined into the same checkpoint,
only the unlinked inode chain fields are replayed, resulting in
uninitialised inode buffers being detected when the next inode
modification is replayed.
To fix this, we need to ensure that we do not set the inode buffer
flag in the buffer log item format flags if the inode allocation has
not already hit the log. To avoid requiring a change to log
recovery, we really need to make this a modification that relies
only on in-memory sate.
We can do this by checking during buffer log formatting (while the
CIL cannot be flushed) if we are still in the same sequence when we
commit the unlink transaction as the inode allocation transaction.
If we are, then we do not add the inode buffer flag to the buffer
log format item flags. This means the entire buffer will be
replayed, not just the unlinked fields. We do this while
CIL flusheѕ are locked out to ensure that we don't race with the
sequence numbers changing and hence fail to put the inode buffer
flag in the buffer format flags when we really need to.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 14 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.h | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_log.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 45 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_buf.c | 20 |
6 files changed, 74 insertions, 11 deletions
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index bcbb66150838..02a80984aa05 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
@@ -254,6 +254,20 @@ xfs_buf_item_format( | |||
254 | vecp++; | 254 | vecp++; |
255 | nvecs = 1; | 255 | nvecs = 1; |
256 | 256 | ||
257 | /* | ||
258 | * If it is an inode buffer, transfer the in-memory state to the | ||
259 | * format flags and clear the in-memory state. We do not transfer | ||
260 | * this state if the inode buffer allocation has not yet been committed | ||
261 | * to the log as setting the XFS_BLI_INODE_BUF flag will prevent | ||
262 | * correct replay of the inode allocation. | ||
263 | */ | ||
264 | if (bip->bli_flags & XFS_BLI_INODE_BUF) { | ||
265 | if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && | ||
266 | xfs_log_item_in_current_chkpt(&bip->bli_item))) | ||
267 | bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; | ||
268 | bip->bli_flags &= ~XFS_BLI_INODE_BUF; | ||
269 | } | ||
270 | |||
257 | if (bip->bli_flags & XFS_BLI_STALE) { | 271 | if (bip->bli_flags & XFS_BLI_STALE) { |
258 | /* | 272 | /* |
259 | * The buffer is stale, so all we need to log | 273 | * The buffer is stale, so all we need to log |
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 8cbb82b1d95c..f20bb472d582 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h | |||
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format { | |||
69 | #define XFS_BLI_LOGGED 0x08 | 69 | #define XFS_BLI_LOGGED 0x08 |
70 | #define XFS_BLI_INODE_ALLOC_BUF 0x10 | 70 | #define XFS_BLI_INODE_ALLOC_BUF 0x10 |
71 | #define XFS_BLI_STALE_INODE 0x20 | 71 | #define XFS_BLI_STALE_INODE 0x20 |
72 | #define XFS_BLI_INODE_BUF 0x40 | ||
72 | 73 | ||
73 | #define XFS_BLI_FLAGS \ | 74 | #define XFS_BLI_FLAGS \ |
74 | { XFS_BLI_HOLD, "HOLD" }, \ | 75 | { XFS_BLI_HOLD, "HOLD" }, \ |
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format { | |||
76 | { XFS_BLI_STALE, "STALE" }, \ | 77 | { XFS_BLI_STALE, "STALE" }, \ |
77 | { XFS_BLI_LOGGED, "LOGGED" }, \ | 78 | { XFS_BLI_LOGGED, "LOGGED" }, \ |
78 | { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ | 79 | { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ |
79 | { XFS_BLI_STALE_INODE, "STALE_INODE" } | 80 | { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ |
81 | { XFS_BLI_INODE_BUF, "INODE_BUF" } | ||
80 | 82 | ||
81 | 83 | ||
82 | #ifdef __KERNEL__ | 84 | #ifdef __KERNEL__ |
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 4a0c57432e8f..04c78e642cc8 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h | |||
@@ -198,6 +198,7 @@ xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); | |||
198 | int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, | 198 | int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, |
199 | struct xfs_log_vec *log_vector, | 199 | struct xfs_log_vec *log_vector, |
200 | xfs_lsn_t *commit_lsn, int flags); | 200 | xfs_lsn_t *commit_lsn, int flags); |
201 | bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); | ||
201 | 202 | ||
202 | #endif | 203 | #endif |
203 | 204 | ||
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9b21f80f31ce..bb17cc044bf3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c | |||
@@ -200,6 +200,15 @@ xlog_cil_insert( | |||
200 | ctx->nvecs += diff_iovecs; | 200 | ctx->nvecs += diff_iovecs; |
201 | 201 | ||
202 | /* | 202 | /* |
203 | * If this is the first time the item is being committed to the CIL, | ||
204 | * store the sequence number on the log item so we can tell | ||
205 | * in future commits whether this is the first checkpoint the item is | ||
206 | * being committed into. | ||
207 | */ | ||
208 | if (!item->li_seq) | ||
209 | item->li_seq = ctx->sequence; | ||
210 | |||
211 | /* | ||
203 | * Now transfer enough transaction reservation to the context ticket | 212 | * Now transfer enough transaction reservation to the context ticket |
204 | * for the checkpoint. The context ticket is special - the unit | 213 | * for the checkpoint. The context ticket is special - the unit |
205 | * reservation has to grow as well as the current reservation as we | 214 | * reservation has to grow as well as the current reservation as we |
@@ -325,6 +334,10 @@ xlog_cil_free_logvec( | |||
325 | * For more specific information about the order of operations in | 334 | * For more specific information about the order of operations in |
326 | * xfs_log_commit_cil() please refer to the comments in | 335 | * xfs_log_commit_cil() please refer to the comments in |
327 | * xfs_trans_commit_iclog(). | 336 | * xfs_trans_commit_iclog(). |
337 | * | ||
338 | * Called with the context lock already held in read mode to lock out | ||
339 | * background commit, returns without it held once background commits are | ||
340 | * allowed again. | ||
328 | */ | 341 | */ |
329 | int | 342 | int |
330 | xfs_log_commit_cil( | 343 | xfs_log_commit_cil( |
@@ -678,3 +691,35 @@ restart: | |||
678 | spin_unlock(&cil->xc_cil_lock); | 691 | spin_unlock(&cil->xc_cil_lock); |
679 | return commit_lsn; | 692 | return commit_lsn; |
680 | } | 693 | } |
694 | |||
695 | /* | ||
696 | * Check if the current log item was first committed in this sequence. | ||
697 | * We can't rely on just the log item being in the CIL, we have to check | ||
698 | * the recorded commit sequence number. | ||
699 | * | ||
700 | * Note: for this to be used in a non-racy manner, it has to be called with | ||
701 | * CIL flushing locked out. As a result, it should only be used during the | ||
702 | * transaction commit process when deciding what to format into the item. | ||
703 | */ | ||
704 | bool | ||
705 | xfs_log_item_in_current_chkpt( | ||
706 | struct xfs_log_item *lip) | ||
707 | { | ||
708 | struct xfs_cil_ctx *ctx; | ||
709 | |||
710 | if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) | ||
711 | return false; | ||
712 | if (list_empty(&lip->li_cil)) | ||
713 | return false; | ||
714 | |||
715 | ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; | ||
716 | |||
717 | /* | ||
718 | * li_seq is written on the first commit of a log item to record the | ||
719 | * first checkpoint it is written to. Hence if it is different to the | ||
720 | * current sequence, we're in a new checkpoint. | ||
721 | */ | ||
722 | if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) | ||
723 | return false; | ||
724 | return true; | ||
725 | } | ||
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b1ea20c66b3e..8c69e7824f68 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -835,6 +835,7 @@ typedef struct xfs_log_item { | |||
835 | /* delayed logging */ | 835 | /* delayed logging */ |
836 | struct list_head li_cil; /* CIL pointers */ | 836 | struct list_head li_cil; /* CIL pointers */ |
837 | struct xfs_log_vec *li_lv; /* active log vector */ | 837 | struct xfs_log_vec *li_lv; /* active log vector */ |
838 | xfs_lsn_t li_seq; /* CIL commit seq */ | ||
838 | } xfs_log_item_t; | 839 | } xfs_log_item_t; |
839 | 840 | ||
840 | #define XFS_LI_IN_AIL 0x1 | 841 | #define XFS_LI_IN_AIL 0x1 |
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3390c3e7441b..63d81a22f4fd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
@@ -792,7 +792,7 @@ xfs_trans_binval( | |||
792 | XFS_BUF_UNDELAYWRITE(bp); | 792 | XFS_BUF_UNDELAYWRITE(bp); |
793 | XFS_BUF_STALE(bp); | 793 | XFS_BUF_STALE(bp); |
794 | bip->bli_flags |= XFS_BLI_STALE; | 794 | bip->bli_flags |= XFS_BLI_STALE; |
795 | bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); | 795 | bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); |
796 | bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; | 796 | bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; |
797 | bip->bli_format.blf_flags |= XFS_BLF_CANCEL; | 797 | bip->bli_format.blf_flags |= XFS_BLF_CANCEL; |
798 | memset((char *)(bip->bli_format.blf_data_map), 0, | 798 | memset((char *)(bip->bli_format.blf_data_map), 0, |
@@ -802,16 +802,16 @@ xfs_trans_binval( | |||
802 | } | 802 | } |
803 | 803 | ||
804 | /* | 804 | /* |
805 | * This call is used to indicate that the buffer contains on-disk | 805 | * This call is used to indicate that the buffer contains on-disk inodes which |
806 | * inodes which must be handled specially during recovery. They | 806 | * must be handled specially during recovery. They require special handling |
807 | * require special handling because only the di_next_unlinked from | 807 | * because only the di_next_unlinked from the inodes in the buffer should be |
808 | * the inodes in the buffer should be recovered. The rest of the | 808 | * recovered. The rest of the data in the buffer is logged via the inodes |
809 | * data in the buffer is logged via the inodes themselves. | 809 | * themselves. |
810 | * | 810 | * |
811 | * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log | 811 | * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be |
812 | * format structure so that we'll know what to do at recovery time. | 812 | * transferred to the buffer's log format structure so that we'll know what to |
813 | * do at recovery time. | ||
813 | */ | 814 | */ |
814 | /* ARGSUSED */ | ||
815 | void | 815 | void |
816 | xfs_trans_inode_buf( | 816 | xfs_trans_inode_buf( |
817 | xfs_trans_t *tp, | 817 | xfs_trans_t *tp, |
@@ -826,7 +826,7 @@ xfs_trans_inode_buf( | |||
826 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); | 826 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); |
827 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 827 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
828 | 828 | ||
829 | bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; | 829 | bip->bli_flags |= XFS_BLI_INODE_BUF; |
830 | } | 830 | } |
831 | 831 | ||
832 | /* | 832 | /* |