aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2010-05-20 09:19:42 -0400
committerAlex Elder <aelder@sgi.com>2010-05-24 11:41:22 -0400
commitccf7c23fc129e75ef60e6f59f60a485b7a056598 (patch)
tree957539e31ee2a7155bbf9bb085ec1cb1d3432d3a /fs/xfs
parentdf806158b0f6eb24247773b4a19b8b59d7217e59 (diff)
xfs: Ensure inode allocation buffers are fully replayed
With delayed logging, we can get inode allocation buffers in the same transaction inode unlink buffers. We don't currently mark inode allocation buffers in the log, so inode unlink buffers take precedence over allocation buffers. The result is that when they are combined into the same checkpoint, only the unlinked inode chain fields are replayed, resulting in uninitialised inode buffers being detected when the next inode modification is replayed. To fix this, we need to ensure that we do not set the inode buffer flag in the buffer log item format flags if the inode allocation has not already hit the log. To avoid requiring a change to log recovery, we really need to make this a modification that relies only on in-memory sate. We can do this by checking during buffer log formatting (while the CIL cannot be flushed) if we are still in the same sequence when we commit the unlink transaction as the inode allocation transaction. If we are, then we do not add the inode buffer flag to the buffer log format item flags. This means the entire buffer will be replayed, not just the unlinked fields. We do this while CIL flusheѕ are locked out to ensure that we don't race with the sequence numbers changing and hence fail to put the inode buffer flag in the buffer format flags when we really need to. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_buf_item.c14
-rw-r--r--fs/xfs/xfs_buf_item.h4
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_cil.c45
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--fs/xfs/xfs_trans_buf.c20
6 files changed, 74 insertions, 11 deletions
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index bcbb66150838..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
261 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 8cbb82b1d95c..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 4a0c57432e8f..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -198,6 +198,7 @@ xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector, 199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags); 200 xfs_lsn_t *commit_lsn, int flags);
201bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
201 202
202#endif 203#endif
203 204
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9b21f80f31ce..bb17cc044bf3 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -200,6 +200,15 @@ xlog_cil_insert(
200 ctx->nvecs += diff_iovecs; 200 ctx->nvecs += diff_iovecs;
201 201
202 /* 202 /*
203 * If this is the first time the item is being committed to the CIL,
204 * store the sequence number on the log item so we can tell
205 * in future commits whether this is the first checkpoint the item is
206 * being committed into.
207 */
208 if (!item->li_seq)
209 item->li_seq = ctx->sequence;
210
211 /*
203 * Now transfer enough transaction reservation to the context ticket 212 * Now transfer enough transaction reservation to the context ticket
204 * for the checkpoint. The context ticket is special - the unit 213 * for the checkpoint. The context ticket is special - the unit
205 * reservation has to grow as well as the current reservation as we 214 * reservation has to grow as well as the current reservation as we
@@ -325,6 +334,10 @@ xlog_cil_free_logvec(
325 * For more specific information about the order of operations in 334 * For more specific information about the order of operations in
326 * xfs_log_commit_cil() please refer to the comments in 335 * xfs_log_commit_cil() please refer to the comments in
327 * xfs_trans_commit_iclog(). 336 * xfs_trans_commit_iclog().
337 *
338 * Called with the context lock already held in read mode to lock out
339 * background commit, returns without it held once background commits are
340 * allowed again.
328 */ 341 */
329int 342int
330xfs_log_commit_cil( 343xfs_log_commit_cil(
@@ -678,3 +691,35 @@ restart:
678 spin_unlock(&cil->xc_cil_lock); 691 spin_unlock(&cil->xc_cil_lock);
679 return commit_lsn; 692 return commit_lsn;
680} 693}
694
695/*
696 * Check if the current log item was first committed in this sequence.
697 * We can't rely on just the log item being in the CIL, we have to check
698 * the recorded commit sequence number.
699 *
700 * Note: for this to be used in a non-racy manner, it has to be called with
701 * CIL flushing locked out. As a result, it should only be used during the
702 * transaction commit process when deciding what to format into the item.
703 */
704bool
705xfs_log_item_in_current_chkpt(
706 struct xfs_log_item *lip)
707{
708 struct xfs_cil_ctx *ctx;
709
710 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711 return false;
712 if (list_empty(&lip->li_cil))
713 return false;
714
715 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716
717 /*
718 * li_seq is written on the first commit of a log item to record the
719 * first checkpoint it is written to. Hence if it is different to the
720 * current sequence, we're in a new checkpoint.
721 */
722 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723 return false;
724 return true;
725}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b1ea20c66b3e..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -835,6 +835,7 @@ typedef struct xfs_log_item {
835 /* delayed logging */ 835 /* delayed logging */
836 struct list_head li_cil; /* CIL pointers */ 836 struct list_head li_cil; /* CIL pointers */
837 struct xfs_log_vec *li_lv; /* active log vector */ 837 struct xfs_log_vec *li_lv; /* active log vector */
838 xfs_lsn_t li_seq; /* CIL commit seq */
838} xfs_log_item_t; 839} xfs_log_item_t;
839 840
840#define XFS_LI_IN_AIL 0x1 841#define XFS_LI_IN_AIL 0x1
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 3390c3e7441b..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -792,7 +792,7 @@ xfs_trans_binval(
792 XFS_BUF_UNDELAYWRITE(bp); 792 XFS_BUF_UNDELAYWRITE(bp);
793 XFS_BUF_STALE(bp); 793 XFS_BUF_STALE(bp);
794 bip->bli_flags |= XFS_BLI_STALE; 794 bip->bli_flags |= XFS_BLI_STALE;
795 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); 795 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; 796 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797 bip->bli_format.blf_flags |= XFS_BLF_CANCEL; 797 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
798 memset((char *)(bip->bli_format.blf_data_map), 0, 798 memset((char *)(bip->bli_format.blf_data_map), 0,
@@ -802,16 +802,16 @@ xfs_trans_binval(
802} 802}
803 803
804/* 804/*
805 * This call is used to indicate that the buffer contains on-disk 805 * This call is used to indicate that the buffer contains on-disk inodes which
806 * inodes which must be handled specially during recovery. They 806 * must be handled specially during recovery. They require special handling
807 * require special handling because only the di_next_unlinked from 807 * because only the di_next_unlinked from the inodes in the buffer should be
808 * the inodes in the buffer should be recovered. The rest of the 808 * recovered. The rest of the data in the buffer is logged via the inodes
809 * data in the buffer is logged via the inodes themselves. 809 * themselves.
810 * 810 *
811 * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log 811 * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
812 * format structure so that we'll know what to do at recovery time. 812 * transferred to the buffer's log format structure so that we'll know what to
813 * do at recovery time.
813 */ 814 */
814/* ARGSUSED */
815void 815void
816xfs_trans_inode_buf( 816xfs_trans_inode_buf(
817 xfs_trans_t *tp, 817 xfs_trans_t *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
827 ASSERT(atomic_read(&bip->bli_refcount) > 0); 827 ASSERT(atomic_read(&bip->bli_refcount) > 0);
828 828
829 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; 829 bip->bli_flags |= XFS_BLI_INODE_BUF;
830} 830}
831 831
832/* 832/*