diff options
author | Christoph Hellwig <hch@infradead.org> | 2011-01-07 08:02:23 -0500 |
---|---|---|
committer | Alex Elder <aelder@sgi.com> | 2011-01-11 21:28:42 -0500 |
commit | bfc60177f8ab509bc225becbb58f7e53a0e33e81 (patch) | |
tree | d5475852558efd0e06c698f4d64926581134b0fb /fs | |
parent | a46db60834883c1c8c665d7fcc7b4ab66f5966fc (diff) |
xfs: fix error handling for synchronous writes
If we get an IO error on a synchronous superblock write, we attach an
error release function to it so that when the last reference goes away
the release function is called and the buffer is invalidated and
unlocked. The buffer is left locked until the release function is
called so that other concurrent users of the buffer will be locked out
until the buffer error is fully processed.
Unfortunately, for the superblock buffer the filesyetm itself holds a
reference to the buffer which prevents the reference count from
dropping to zero and the release function being called. As a result,
once an IO error occurs on a sync write, the buffer will never be
unlocked and all future attempts to lock the buffer will hang.
To make matters worse, this problems is not unique to such buffers;
if there is a concurrent _xfs_buf_find() running, the lookup will grab
a reference to the buffer and then wait on the buffer lock, preventing
the reference count from ever falling to zero and hence unlocking the
buffer.
As such, the whole b_relse function implementation is broken because it
cannot rely on the buffer reference count falling to zero to unlock the
errored buffer. The synchronous write error path is the only path that
uses this callback - it is used to ensure that the synchronous waiter
gets the buffer error before the error state is cleared from the buffer
by the release function.
Given that the only sychronous buffer writes now go through xfs_bwrite
and the error path in question can only occur for a write of a dirty,
logged buffer, we can move most of the b_relse processing to happen
inline in xfs_buf_iodone_callbacks, just like a normal I/O completion.
In addition to that we make sure the error is not cleared in
xfs_buf_iodone_callbacks, so that xfs_bwrite can reliably check it.
Given that xfs_bwrite keeps the buffer locked until it has waited for
it and checked the error this allows to reliably propagate the error
to the caller, and make sure that the buffer is reliably unlocked.
Given that xfs_buf_iodone_callbacks was the only instance of the
b_relse callback we can remove it entirely.
Based on earlier patches by Dave Chinner and Ajeet Yadav.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Ajeet Yadav <ajeet.yadav.77@gmail.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.c | 7 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.h | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 151 |
3 files changed, 51 insertions, 114 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 92f1f2acc6ab..ac1c7e8378dd 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
@@ -896,7 +896,6 @@ xfs_buf_rele( | |||
896 | trace_xfs_buf_rele(bp, _RET_IP_); | 896 | trace_xfs_buf_rele(bp, _RET_IP_); |
897 | 897 | ||
898 | if (!pag) { | 898 | if (!pag) { |
899 | ASSERT(!bp->b_relse); | ||
900 | ASSERT(list_empty(&bp->b_lru)); | 899 | ASSERT(list_empty(&bp->b_lru)); |
901 | ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); | 900 | ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); |
902 | if (atomic_dec_and_test(&bp->b_hold)) | 901 | if (atomic_dec_and_test(&bp->b_hold)) |
@@ -908,11 +907,7 @@ xfs_buf_rele( | |||
908 | 907 | ||
909 | ASSERT(atomic_read(&bp->b_hold) > 0); | 908 | ASSERT(atomic_read(&bp->b_hold) > 0); |
910 | if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { | 909 | if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { |
911 | if (bp->b_relse) { | 910 | if (!(bp->b_flags & XBF_STALE) && |
912 | atomic_inc(&bp->b_hold); | ||
913 | spin_unlock(&pag->pag_buf_lock); | ||
914 | bp->b_relse(bp); | ||
915 | } else if (!(bp->b_flags & XBF_STALE) && | ||
916 | atomic_read(&bp->b_lru_ref)) { | 911 | atomic_read(&bp->b_lru_ref)) { |
917 | xfs_buf_lru_add(bp); | 912 | xfs_buf_lru_add(bp); |
918 | spin_unlock(&pag->pag_buf_lock); | 913 | spin_unlock(&pag->pag_buf_lock); |
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a76c2428faff..cbe65950e524 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h | |||
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg { | |||
152 | 152 | ||
153 | struct xfs_buf; | 153 | struct xfs_buf; |
154 | typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); | 154 | typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); |
155 | typedef void (*xfs_buf_relse_t)(struct xfs_buf *); | ||
156 | typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); | ||
157 | 155 | ||
158 | #define XB_PAGES 2 | 156 | #define XB_PAGES 2 |
159 | 157 | ||
@@ -183,7 +181,6 @@ typedef struct xfs_buf { | |||
183 | void *b_addr; /* virtual address of buffer */ | 181 | void *b_addr; /* virtual address of buffer */ |
184 | struct work_struct b_iodone_work; | 182 | struct work_struct b_iodone_work; |
185 | xfs_buf_iodone_t b_iodone; /* I/O completion function */ | 183 | xfs_buf_iodone_t b_iodone; /* I/O completion function */ |
186 | xfs_buf_relse_t b_relse; /* releasing function */ | ||
187 | struct completion b_iowait; /* queue for I/O waiters */ | 184 | struct completion b_iowait; /* queue for I/O waiters */ |
188 | void *b_fspriv; | 185 | void *b_fspriv; |
189 | void *b_fspriv2; | 186 | void *b_fspriv2; |
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp); | |||
323 | #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) | 320 | #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) |
324 | #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) | 321 | #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) |
325 | #define XFS_BUF_SET_START(bp) do { } while (0) | 322 | #define XFS_BUF_SET_START(bp) do { } while (0) |
326 | #define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) | ||
327 | 323 | ||
328 | #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) | 324 | #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) |
329 | #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) | 325 | #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) |
@@ -360,8 +356,7 @@ xfs_buf_set_ref( | |||
360 | 356 | ||
361 | static inline void xfs_buf_relse(xfs_buf_t *bp) | 357 | static inline void xfs_buf_relse(xfs_buf_t *bp) |
362 | { | 358 | { |
363 | if (!bp->b_relse) | 359 | xfs_buf_unlock(bp); |
364 | xfs_buf_unlock(bp); | ||
365 | xfs_buf_rele(bp); | 360 | xfs_buf_rele(bp); |
366 | } | 361 | } |
367 | 362 | ||
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ed2b65f3f8b9..98c6f73b6752 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
@@ -141,7 +141,6 @@ xfs_buf_item_log_check( | |||
141 | #define xfs_buf_item_log_check(x) | 141 | #define xfs_buf_item_log_check(x) |
142 | #endif | 142 | #endif |
143 | 143 | ||
144 | STATIC void xfs_buf_error_relse(xfs_buf_t *bp); | ||
145 | STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); | 144 | STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); |
146 | 145 | ||
147 | /* | 146 | /* |
@@ -959,128 +958,76 @@ xfs_buf_do_callbacks( | |||
959 | */ | 958 | */ |
960 | void | 959 | void |
961 | xfs_buf_iodone_callbacks( | 960 | xfs_buf_iodone_callbacks( |
962 | xfs_buf_t *bp) | 961 | struct xfs_buf *bp) |
963 | { | 962 | { |
964 | xfs_log_item_t *lip; | 963 | struct xfs_log_item *lip = bp->b_fspriv; |
965 | static ulong lasttime; | 964 | struct xfs_mount *mp = lip->li_mountp; |
966 | static xfs_buftarg_t *lasttarg; | 965 | static ulong lasttime; |
967 | xfs_mount_t *mp; | 966 | static xfs_buftarg_t *lasttarg; |
968 | 967 | ||
969 | ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); | 968 | if (likely(!XFS_BUF_GETERROR(bp))) |
970 | lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); | 969 | goto do_callbacks; |
971 | 970 | ||
972 | if (XFS_BUF_GETERROR(bp) != 0) { | 971 | /* |
973 | /* | 972 | * If we've already decided to shutdown the filesystem because of |
974 | * If we've already decided to shutdown the filesystem | 973 | * I/O errors, there's no point in giving this a retry. |
975 | * because of IO errors, there's no point in giving this | 974 | */ |
976 | * a retry. | 975 | if (XFS_FORCED_SHUTDOWN(mp)) { |
977 | */ | 976 | XFS_BUF_SUPER_STALE(bp); |
978 | mp = lip->li_mountp; | 977 | trace_xfs_buf_item_iodone(bp, _RET_IP_); |
979 | if (XFS_FORCED_SHUTDOWN(mp)) { | 978 | goto do_callbacks; |
980 | ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); | 979 | } |
981 | XFS_BUF_SUPER_STALE(bp); | ||
982 | trace_xfs_buf_item_iodone(bp, _RET_IP_); | ||
983 | xfs_buf_do_callbacks(bp); | ||
984 | XFS_BUF_SET_FSPRIVATE(bp, NULL); | ||
985 | XFS_BUF_CLR_IODONE_FUNC(bp); | ||
986 | xfs_buf_ioend(bp, 0); | ||
987 | return; | ||
988 | } | ||
989 | 980 | ||
990 | if ((XFS_BUF_TARGET(bp) != lasttarg) || | 981 | if (XFS_BUF_TARGET(bp) != lasttarg || |
991 | (time_after(jiffies, (lasttime + 5*HZ)))) { | 982 | time_after(jiffies, (lasttime + 5*HZ))) { |
992 | lasttime = jiffies; | 983 | lasttime = jiffies; |
993 | cmn_err(CE_ALERT, "Device %s, XFS metadata write error" | 984 | cmn_err(CE_ALERT, "Device %s, XFS metadata write error" |
994 | " block 0x%llx in %s", | 985 | " block 0x%llx in %s", |
995 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), | 986 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), |
996 | (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); | 987 | (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); |
997 | } | 988 | } |
998 | lasttarg = XFS_BUF_TARGET(bp); | 989 | lasttarg = XFS_BUF_TARGET(bp); |
999 | 990 | ||
1000 | if (XFS_BUF_ISASYNC(bp)) { | 991 | /* |
1001 | /* | 992 | * If the write was asynchronous then noone will be looking for the |
1002 | * If the write was asynchronous then noone will be | 993 | * error. Clear the error state and write the buffer out again. |
1003 | * looking for the error. Clear the error state | 994 | * |
1004 | * and write the buffer out again delayed write. | 995 | * During sync or umount we'll write all pending buffers again |
1005 | * | 996 | * synchronous, which will catch these errors if they keep hanging |
1006 | * XXXsup This is OK, so long as we catch these | 997 | * around. |
1007 | * before we start the umount; we don't want these | 998 | */ |
1008 | * DELWRI metadata bufs to be hanging around. | 999 | if (XFS_BUF_ISASYNC(bp)) { |
1009 | */ | 1000 | XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ |
1010 | XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ | 1001 | |
1011 | 1002 | if (!XFS_BUF_ISSTALE(bp)) { | |
1012 | if (!(XFS_BUF_ISSTALE(bp))) { | 1003 | XFS_BUF_DELAYWRITE(bp); |
1013 | XFS_BUF_DELAYWRITE(bp); | ||
1014 | XFS_BUF_DONE(bp); | ||
1015 | XFS_BUF_SET_START(bp); | ||
1016 | } | ||
1017 | ASSERT(XFS_BUF_IODONE_FUNC(bp)); | ||
1018 | trace_xfs_buf_item_iodone_async(bp, _RET_IP_); | ||
1019 | xfs_buf_relse(bp); | ||
1020 | } else { | ||
1021 | /* | ||
1022 | * If the write of the buffer was not asynchronous, | ||
1023 | * then we want to make sure to return the error | ||
1024 | * to the caller of bwrite(). Because of this we | ||
1025 | * cannot clear the B_ERROR state at this point. | ||
1026 | * Instead we install a callback function that | ||
1027 | * will be called when the buffer is released, and | ||
1028 | * that routine will clear the error state and | ||
1029 | * set the buffer to be written out again after | ||
1030 | * some delay. | ||
1031 | */ | ||
1032 | /* We actually overwrite the existing b-relse | ||
1033 | function at times, but we're gonna be shutting down | ||
1034 | anyway. */ | ||
1035 | XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); | ||
1036 | XFS_BUF_DONE(bp); | 1004 | XFS_BUF_DONE(bp); |
1037 | XFS_BUF_FINISH_IOWAIT(bp); | 1005 | XFS_BUF_SET_START(bp); |
1038 | } | 1006 | } |
1007 | ASSERT(XFS_BUF_IODONE_FUNC(bp)); | ||
1008 | trace_xfs_buf_item_iodone_async(bp, _RET_IP_); | ||
1009 | xfs_buf_relse(bp); | ||
1039 | return; | 1010 | return; |
1040 | } | 1011 | } |
1041 | 1012 | ||
1042 | xfs_buf_do_callbacks(bp); | 1013 | /* |
1043 | XFS_BUF_SET_FSPRIVATE(bp, NULL); | 1014 | * If the write of the buffer was synchronous, we want to make |
1044 | XFS_BUF_CLR_IODONE_FUNC(bp); | 1015 | * sure to return the error to the caller of xfs_bwrite(). |
1045 | xfs_buf_ioend(bp, 0); | 1016 | */ |
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * This is a callback routine attached to a buffer which gets an error | ||
1050 | * when being written out synchronously. | ||
1051 | */ | ||
1052 | STATIC void | ||
1053 | xfs_buf_error_relse( | ||
1054 | xfs_buf_t *bp) | ||
1055 | { | ||
1056 | xfs_log_item_t *lip; | ||
1057 | xfs_mount_t *mp; | ||
1058 | |||
1059 | lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); | ||
1060 | mp = (xfs_mount_t *)lip->li_mountp; | ||
1061 | ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); | ||
1062 | |||
1063 | XFS_BUF_STALE(bp); | 1017 | XFS_BUF_STALE(bp); |
1064 | XFS_BUF_DONE(bp); | 1018 | XFS_BUF_DONE(bp); |
1065 | XFS_BUF_UNDELAYWRITE(bp); | 1019 | XFS_BUF_UNDELAYWRITE(bp); |
1066 | XFS_BUF_ERROR(bp,0); | ||
1067 | 1020 | ||
1068 | trace_xfs_buf_error_relse(bp, _RET_IP_); | 1021 | trace_xfs_buf_error_relse(bp, _RET_IP_); |
1022 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); | ||
1069 | 1023 | ||
1070 | if (! XFS_FORCED_SHUTDOWN(mp)) | 1024 | do_callbacks: |
1071 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); | ||
1072 | /* | ||
1073 | * We have to unpin the pinned buffers so do the | ||
1074 | * callbacks. | ||
1075 | */ | ||
1076 | xfs_buf_do_callbacks(bp); | 1025 | xfs_buf_do_callbacks(bp); |
1077 | XFS_BUF_SET_FSPRIVATE(bp, NULL); | 1026 | XFS_BUF_SET_FSPRIVATE(bp, NULL); |
1078 | XFS_BUF_CLR_IODONE_FUNC(bp); | 1027 | XFS_BUF_CLR_IODONE_FUNC(bp); |
1079 | XFS_BUF_SET_BRELSE_FUNC(bp,NULL); | 1028 | xfs_buf_ioend(bp, 0); |
1080 | xfs_buf_relse(bp); | ||
1081 | } | 1029 | } |
1082 | 1030 | ||
1083 | |||
1084 | /* | 1031 | /* |
1085 | * This is the iodone() function for buffers which have been | 1032 | * This is the iodone() function for buffers which have been |
1086 | * logged. It is called when they are eventually flushed out. | 1033 | * logged. It is called when they are eventually flushed out. |