author		Christoph Hellwig <hch@infradead.org>	2011-01-07 08:02:23 -0500
committer	Alex Elder <aelder@sgi.com>	2011-01-11 21:28:42 -0500
commit		bfc60177f8ab509bc225becbb58f7e53a0e33e81 (patch)
tree		d5475852558efd0e06c698f4d64926581134b0fb /fs
parent		a46db60834883c1c8c665d7fcc7b4ab66f5966fc (diff)
xfs: fix error handling for synchronous writes
If we get an IO error on a synchronous superblock write, we attach an error release function to it so that when the last reference goes away the release function is called and the buffer is invalidated and unlocked. The buffer is left locked until the release function is called so that other concurrent users of the buffer will be locked out until the buffer error is fully processed.

Unfortunately, for the superblock buffer the filesystem itself holds a reference to the buffer, which prevents the reference count from dropping to zero and the release function from being called. As a result, once an IO error occurs on a sync write, the buffer will never be unlocked and all future attempts to lock the buffer will hang. To make matters worse, this problem is not unique to such buffers; if there is a concurrent _xfs_buf_find() running, the lookup will grab a reference to the buffer and then wait on the buffer lock, preventing the reference count from ever falling to zero and hence the buffer from ever being unlocked.

As such, the whole b_relse function implementation is broken because it cannot rely on the buffer reference count falling to zero to unlock the errored buffer. The synchronous write error path is the only path that uses this callback - it is used to ensure that the synchronous waiter gets the buffer error before the error state is cleared from the buffer by the release function.

Given that the only synchronous buffer writes now go through xfs_bwrite, and the error path in question can only occur for a write of a dirty, logged buffer, we can move most of the b_relse processing inline into xfs_buf_iodone_callbacks, just like a normal I/O completion. In addition, we make sure the error is not cleared in xfs_buf_iodone_callbacks, so that xfs_bwrite can reliably check it. Because xfs_bwrite keeps the buffer locked until it has waited for it and checked the error, this allows us to reliably propagate the error to the caller and to make sure that the buffer is reliably unlocked.

Given that xfs_buf_iodone_callbacks was the only user of the b_relse callback, we can remove it entirely.

Based on earlier patches by Dave Chinner and Ajeet Yadav.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Ajeet Yadav <ajeet.yadav.77@gmail.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
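The locking and error-propagation model the commit message describes - keep the buffer locked across the synchronous wait, leave the error set so the waiter can read it, and only then clear it and drop the lock - can be illustrated outside the kernel. The sketch below is a minimal user-space model and not XFS code: struct buf, buf_io_complete and buf_write_sync are made-up names, and a pthread mutex stands in for the buffer lock.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-in for an I/O buffer; not the xfs_buf layout. */
struct buf {
	pthread_mutex_t	lock;	/* models the buffer lock held across the write */
	int		error;	/* models the buffer error, left set for the waiter */
};

/* Pretend completion handler: it records the error but neither clears it
 * nor unlocks the buffer, mirroring the behaviour the patch gives
 * xfs_buf_iodone_callbacks for synchronous writes. */
static void buf_io_complete(struct buf *bp, int error)
{
	bp->error = error;
}

/* Synchronous write in the style the commit describes: the caller holds the
 * lock over submission and the wait, checks the error itself, clears it,
 * and only then unlocks - no release callback is needed. */
static int buf_write_sync(struct buf *bp, int simulated_io_error)
{
	int error;

	pthread_mutex_lock(&bp->lock);			/* buffer stays locked ...   */
	buf_io_complete(bp, simulated_io_error);	/* ... while I/O "completes" */
	error = bp->error;				/* waiter reads the error    */
	bp->error = 0;					/* clear only after consuming it */
	pthread_mutex_unlock(&bp->lock);		/* reliably unlocked in all cases */
	return error;
}

int main(void)
{
	struct buf bp = { .error = 0 };

	pthread_mutex_init(&bp.lock, NULL);
	printf("clean write  -> %d\n", buf_write_sync(&bp, 0));
	printf("failed write -> %d\n", buf_write_sync(&bp, EIO));
	/* Either way the lock is free again, so no later lookup can hang. */
	return 0;
}

Under these assumptions both calls return with the lock dropped and the error visible to the caller, which is the behaviour the patch restores for errored synchronous writes.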
Diffstat (limited to 'fs')
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.c	  7
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.h	  7
-rw-r--r--	fs/xfs/xfs_buf_item.c		151
3 files changed, 51 insertions(+), 114 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 92f1f2acc6ab..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -896,7 +896,6 @@ xfs_buf_rele(
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
 	if (!pag) {
-		ASSERT(!bp->b_relse);
 		ASSERT(list_empty(&bp->b_lru));
 		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
@@ -908,11 +907,7 @@ xfs_buf_rele(
 
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-		if (bp->b_relse) {
-			atomic_inc(&bp->b_hold);
-			spin_unlock(&pag->pag_buf_lock);
-			bp->b_relse(bp);
-		} else if (!(bp->b_flags & XBF_STALE) &&
+		if (!(bp->b_flags & XBF_STALE) &&
 			   atomic_read(&bp->b_lru_ref)) {
 			xfs_buf_lru_add(bp);
 			spin_unlock(&pag->pag_buf_lock);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a76c2428faff..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg {
 
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 
 #define XB_PAGES	2
 
@@ -183,7 +181,6 @@ typedef struct xfs_buf {
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_iodone_work;
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
-	xfs_buf_relse_t		b_relse;	/* releasing function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_FSPRIVATE2(bp, type)		((type)(bp)->b_fspriv2)
 #define XFS_BUF_SET_FSPRIVATE2(bp, val)		((bp)->b_fspriv2 = (void*)(val))
 #define XFS_BUF_SET_START(bp)			do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func)	((bp)->b_relse = (func))
 
 #define XFS_BUF_PTR(bp)			(xfs_caddr_t)((bp)->b_addr)
 #define XFS_BUF_SET_PTR(bp, val, cnt)	xfs_buf_associate_memory(bp, val, cnt)
@@ -360,8 +356,7 @@ xfs_buf_set_ref(
 
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
-	if (!bp->b_relse)
-		xfs_buf_unlock(bp);
+	xfs_buf_unlock(bp);
 	xfs_buf_rele(bp);
 }
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ed2b65f3f8b9..98c6f73b6752 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,7 +141,6 @@ xfs_buf_item_log_check(
 #define		xfs_buf_item_log_check(x)
 #endif
 
-STATIC void	xfs_buf_error_relse(xfs_buf_t *bp);
 STATIC void	xfs_buf_do_callbacks(struct xfs_buf *bp);
 
 /*
@@ -959,128 +958,76 @@ xfs_buf_do_callbacks(
  */
 void
 xfs_buf_iodone_callbacks(
-	xfs_buf_t	*bp)
+	struct xfs_buf		*bp)
 {
-	xfs_log_item_t	*lip;
-	static ulong	lasttime;
-	static xfs_buftarg_t *lasttarg;
-	xfs_mount_t	*mp;
+	struct xfs_log_item	*lip = bp->b_fspriv;
+	struct xfs_mount	*mp = lip->li_mountp;
+	static ulong		lasttime;
+	static xfs_buftarg_t	*lasttarg;
 
-	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	if (likely(!XFS_BUF_GETERROR(bp)))
+		goto do_callbacks;
 
-	if (XFS_BUF_GETERROR(bp) != 0) {
-		/*
-		 * If we've already decided to shutdown the filesystem
-		 * because of IO errors, there's no point in giving this
-		 * a retry.
-		 */
-		mp = lip->li_mountp;
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-			XFS_BUF_SUPER_STALE(bp);
-			trace_xfs_buf_item_iodone(bp, _RET_IP_);
-			xfs_buf_do_callbacks(bp);
-			XFS_BUF_SET_FSPRIVATE(bp, NULL);
-			XFS_BUF_CLR_IODONE_FUNC(bp);
-			xfs_buf_ioend(bp, 0);
-			return;
-		}
+	/*
+	 * If we've already decided to shutdown the filesystem because of
+	 * I/O errors, there's no point in giving this a retry.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		XFS_BUF_SUPER_STALE(bp);
+		trace_xfs_buf_item_iodone(bp, _RET_IP_);
+		goto do_callbacks;
+	}
 
-		if ((XFS_BUF_TARGET(bp) != lasttarg) ||
-		    (time_after(jiffies, (lasttime + 5*HZ)))) {
-			lasttime = jiffies;
-			cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
-					" block 0x%llx in %s",
-				XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
-			      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
-		}
-		lasttarg = XFS_BUF_TARGET(bp);
+	if (XFS_BUF_TARGET(bp) != lasttarg ||
+	    time_after(jiffies, (lasttime + 5*HZ))) {
+		lasttime = jiffies;
+		cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
+				" block 0x%llx in %s",
+			XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+		      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
+	}
+	lasttarg = XFS_BUF_TARGET(bp);
 
-		if (XFS_BUF_ISASYNC(bp)) {
-			/*
-			 * If the write was asynchronous then noone will be
-			 * looking for the error.  Clear the error state
-			 * and write the buffer out again delayed write.
-			 *
-			 * XXXsup This is OK, so long as we catch these
-			 * before we start the umount; we don't want these
-			 * DELWRI metadata bufs to be hanging around.
-			 */
-			XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
-
-			if (!(XFS_BUF_ISSTALE(bp))) {
-				XFS_BUF_DELAYWRITE(bp);
-				XFS_BUF_DONE(bp);
-				XFS_BUF_SET_START(bp);
-			}
-			ASSERT(XFS_BUF_IODONE_FUNC(bp));
-			trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-			xfs_buf_relse(bp);
-		} else {
-			/*
-			 * If the write of the buffer was not asynchronous,
-			 * then we want to make sure to return the error
-			 * to the caller of bwrite().  Because of this we
-			 * cannot clear the B_ERROR state at this point.
-			 * Instead we install a callback function that
-			 * will be called when the buffer is released, and
-			 * that routine will clear the error state and
-			 * set the buffer to be written out again after
-			 * some delay.
-			 */
-			/* We actually overwrite the existing b-relse
-			   function at times, but we're gonna be shutting down
-			   anyway. */
-			XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
+	/*
+	 * If the write was asynchronous then noone will be looking for the
+	 * error.  Clear the error state and write the buffer out again.
+	 *
+	 * During sync or umount we'll write all pending buffers again
+	 * synchronous, which will catch these errors if they keep hanging
+	 * around.
+	 */
+	if (XFS_BUF_ISASYNC(bp)) {
+		XFS_BUF_ERROR(bp, 0);	/* errno of 0 unsets the flag */
+
+		if (!XFS_BUF_ISSTALE(bp)) {
+			XFS_BUF_DELAYWRITE(bp);
 			XFS_BUF_DONE(bp);
-			XFS_BUF_FINISH_IOWAIT(bp);
+			XFS_BUF_SET_START(bp);
 		}
+		ASSERT(XFS_BUF_IODONE_FUNC(bp));
+		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+		xfs_buf_relse(bp);
 		return;
 	}
 
-	xfs_buf_do_callbacks(bp);
-	XFS_BUF_SET_FSPRIVATE(bp, NULL);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	xfs_buf_ioend(bp, 0);
-}
-
-/*
- * This is a callback routine attached to a buffer which gets an error
- * when being written out synchronously.
- */
-STATIC void
-xfs_buf_error_relse(
-	xfs_buf_t	*bp)
-{
-	xfs_log_item_t	*lip;
-	xfs_mount_t	*mp;
-
-	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-	mp = (xfs_mount_t *)lip->li_mountp;
-	ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-
+	/*
+	 * If the write of the buffer was synchronous, we want to make
+	 * sure to return the error to the caller of xfs_bwrite().
+	 */
 	XFS_BUF_STALE(bp);
 	XFS_BUF_DONE(bp);
 	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_ERROR(bp,0);
 
 	trace_xfs_buf_error_relse(bp, _RET_IP_);
+	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 
-	if (! XFS_FORCED_SHUTDOWN(mp))
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	/*
-	 * We have to unpin the pinned buffers so do the
-	 * callbacks.
-	 */
+do_callbacks:
 	xfs_buf_do_callbacks(bp);
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
-	xfs_buf_relse(bp);
+	xfs_buf_ioend(bp, 0);
 }
 
-
 /*
  * This is the iodone() function for buffers which have been
  * logged.  It is called when they are eventually flushed out.