xfs: clear BAD_SUMMARY if unmounting an unhealthy filesystem

If we know the filesystem metadata isn't healthy during unmount, we want
to encourage the administrator to run xfs_repair right away.  We can't do
this if BAD_SUMMARY will cause an unclean log unmount to force summary
recalculation, so turn it off if the fs is bad.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
author: Darrick J. Wong <darrick.wong@oracle.com> 2019-04-12 10:41:16 -0400
committer: Darrick J. Wong <darrick.wong@oracle.com> 2019-04-14 21:15:57 -0400
commit: 519841c207de9926418d2f39e162097088478781 (patch)
tree: f26f894e44f7d012ee7b981940b64454e1c7492a
parent: 39353ff6e96fb623230341ca89b0f4ef3a04998f (diff)
4 files changed, 81 insertions, 0 deletions
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 95053564a0d5..0915d20975be 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -118,6 +118,8 @@ void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
 void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
                unsigned int *checked);
+void xfs_health_unmount(struct xfs_mount *mp);
 /* Now some helpers. */
 static inline bool
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 941f33037e2f..21728228e08b 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -19,6 +19,80 @@
 #include "xfs_trace.h"
 #include "xfs_health.h"
+/*
+ * Warn about metadata corruption that we detected but haven't fixed, and
+ * make sure we're not sitting on anything that would get in the way of
+ * recovery.
+ *
+ * Samples the per-AG, realtime, and whole-fs sickness masks in turn.  If any
+ * AG or realtime flag is set, or any fs flag other than XFS_SICK_FS_COUNTERS,
+ * a single warning is logged advising the administrator to run xfs_repair,
+ * and XFS_SICK_FS_COUNTERS (if present in the fs sample) is marked healthy.
+ * The "checked" mask handed to the measuring helpers is not consulted here.
+ *
+ * Returns immediately, without sampling or warning, when
+ * XFS_FORCED_SHUTDOWN(mp) is true.
+ */
+void
+xfs_health_unmount(
+        struct xfs_mount        *mp)
+{
+        struct xfs_perag        *pag;
+        xfs_agnumber_t          agno;
+        unsigned int            sick = 0;
+        unsigned int            checked = 0;
+        bool                    warn = false;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return;
+        /*
+         * Measure AG corruption levels.  Every AG from 0 up to sb_agcount is
+         * sampled; each one that reports a nonzero sick mask fires its own
+         * tracepoint and arms the warning below.
+         */
+        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+                pag = xfs_perag_get(mp, agno);
+                xfs_ag_measure_sickness(pag, &sick, &checked);
+                if (sick) {
+                        trace_xfs_ag_unfixed_corruption(mp, agno, sick);
+                        warn = true;
+                }
+                xfs_perag_put(pag);
+        }
+        /* Measure realtime volume corruption levels. */
+        xfs_rt_measure_sickness(mp, &sick, &checked);
+        if (sick) {
+                trace_xfs_rt_unfixed_corruption(mp, sick);
+                warn = true;
+        }
+        /*
+         * Measure fs corruption and keep the sample around for the warning.
+         * See the note below for why we exempt FS_COUNTERS.
+         *
+         * NOTE(review): the FS_COUNTERS test inside the warning branch reads
+         * "sick" again, so it relies on this call leaving the fs-wide mask
+         * there in place of the earlier AG/rt samples -- confirm against
+         * xfs_fs_measure_sickness().
+         */
+        xfs_fs_measure_sickness(mp, &sick, &checked);
+        if (sick & ~XFS_SICK_FS_COUNTERS) {
+                trace_xfs_fs_unfixed_corruption(mp, sick);
+                warn = true;
+        }
+        if (warn) {
+                xfs_warn(mp,
+"Uncorrected metadata errors detected; please run xfs_repair.");
+                /*
+                 * We discovered uncorrected metadata problems at some point
+                 * during this filesystem mount and have advised the
+                 * administrator to run repair once the unmount completes.
+                 *
+                 * However, we must be careful -- when FSCOUNTERS are flagged
+                 * unhealthy, the unmount procedure omits writing the clean
+                 * unmount record to the log so that the next mount will run
+                 * recovery and recompute the summary counters.  In other
+                 * words, we leave a dirty log to get the counters fixed.
+                 *
+                 * Unfortunately, xfs_repair cannot recover dirty logs, so if
+                 * there were filesystem problems, FSCOUNTERS was flagged, and
+                 * the administrator takes our advice to run xfs_repair,
+                 * they'll have to zap the log before repairing structures.
+                 * We don't really want to encourage this, so we mark the
+                 * FSCOUNTERS healthy so that a subsequent repair run won't see
+                 * a dirty log.
+                 *
+                 * Note that this only happens when something other than
+                 * FSCOUNTERS triggered the warning; if FSCOUNTERS is the only
+                 * flag set, it is left alone.
+                 */
+                if (sick & XFS_SICK_FS_COUNTERS)
+                        xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+        }
+}
 /* Mark unhealthy per-fs metadata. */
 void
 xfs_fs_mark_sick(
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 14f454e09e6e..eff8b4c3eb3e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1070,6 +1070,7 @@ xfs_mountfs(
         */
        cancel_delayed_work_sync(&mp->m_reclaim_work);
        xfs_reclaim_inodes(mp, SYNC_WAIT);
+        xfs_health_unmount(mp);
 out_log_dealloc:
        mp->m_flags |= XFS_MOUNT_UNMOUNTING;
        xfs_log_mount_cancel(mp);
@@ -1152,6 +1153,7 @@ xfs_unmountfs(
         */
        cancel_delayed_work_sync(&mp->m_reclaim_work);
        xfs_reclaim_inodes(mp, SYNC_WAIT);
+        xfs_health_unmount(mp);
        xfs_qm_unmount(mp);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f079841c7af6..2464ea351f83 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3461,8 +3461,10 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
        TP_ARGS(mp, flags))
 DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
 DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
 DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
 DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
 DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
@@ -3488,6 +3490,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
        TP_ARGS(mp, agno, flags))
 DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
 DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
 DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
        TP_PROTO(struct xfs_inode *ip, unsigned int flags),
author	Darrick J. Wong <darrick.wong@oracle.com>	2019-04-12 10:41:16 -0400
committer	Darrick J. Wong <darrick.wong@oracle.com>	2019-04-14 21:15:57 -0400
commit	519841c207de9926418d2f39e162097088478781 (patch)
tree	f26f894e44f7d012ee7b981940b64454e1c7492a
parent	39353ff6e96fb623230341ca89b0f4ef3a04998f (diff)

diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 95053564a0d5..0915d20975be 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h
@@ -118,6 +118,8 @@ void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
118	void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,	118	void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
119	unsigned int *checked);	119	unsigned int *checked);
120		120
		121	void xfs_health_unmount(struct xfs_mount *mp);
		122
121	/* Now some helpers. */	123	/* Now some helpers. */
122		124
123	static inline bool	125	static inline bool


diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 941f33037e2f..21728228e08b 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c
@@ -19,6 +19,80 @@
19	#include "xfs_trace.h"	19	#include "xfs_trace.h"
20	#include "xfs_health.h"	20	#include "xfs_health.h"
21		21
		22	/*
		23	* Warn about metadata corruption that we detected but haven't fixed, and
		24	* make sure we're not sitting on anything that would get in the way of
		25	* recovery.
		26	*/
		27	void
		28	xfs_health_unmount(
		29	struct xfs_mount *mp)
		30	{
		31	struct xfs_perag *pag;
		32	xfs_agnumber_t agno;
		33	unsigned int sick = 0;
		34	unsigned int checked = 0;
		35	bool warn = false;
		36
		37	if (XFS_FORCED_SHUTDOWN(mp))
		38	return;
		39
		40	/* Measure AG corruption levels. */
		41	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		42	pag = xfs_perag_get(mp, agno);
		43	xfs_ag_measure_sickness(pag, &sick, &checked);
		44	if (sick) {
		45	trace_xfs_ag_unfixed_corruption(mp, agno, sick);
		46	warn = true;
		47	}
		48	xfs_perag_put(pag);
		49	}
		50
		51	/* Measure realtime volume corruption levels. */
		52	xfs_rt_measure_sickness(mp, &sick, &checked);
		53	if (sick) {
		54	trace_xfs_rt_unfixed_corruption(mp, sick);
		55	warn = true;
		56	}
		57
		58	/*
		59	* Measure fs corruption and keep the sample around for the warning.
		60	* See the note below for why we exempt FS_COUNTERS.
		61	*/
		62	xfs_fs_measure_sickness(mp, &sick, &checked);
		63	if (sick & ~XFS_SICK_FS_COUNTERS) {
		64	trace_xfs_fs_unfixed_corruption(mp, sick);
		65	warn = true;
		66	}
		67
		68	if (warn) {
		69	xfs_warn(mp,
		70	"Uncorrected metadata errors detected; please run xfs_repair.");
		71
		72	/*
		73	* We discovered uncorrected metadata problems at some point
		74	* during this filesystem mount and have advised the
		75	* administrator to run repair once the unmount completes.
		76	*
		77	* However, we must be careful -- when FSCOUNTERS are flagged
		78	* unhealthy, the unmount procedure omits writing the clean
		79	* unmount record to the log so that the next mount will run
		80	* recovery and recompute the summary counters. In other
		81	* words, we leave a dirty log to get the counters fixed.
		82	*
		83	* Unfortunately, xfs_repair cannot recover dirty logs, so if
		84	* there were filesystem problems, FSCOUNTERS was flagged, and
		85	* the administrator takes our advice to run xfs_repair,
		86	* they'll have to zap the log before repairing structures.
		87	* We don't really want to encourage this, so we mark the
		88	* FSCOUNTERS healthy so that a subsequent repair run won't see
		89	* a dirty log.
		90	*/
		91	if (sick & XFS_SICK_FS_COUNTERS)
		92	xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
		93	}
		94	}
		95
22	/* Mark unhealthy per-fs metadata. */	96	/* Mark unhealthy per-fs metadata. */
23	void	97	void
24	xfs_fs_mark_sick(	98	xfs_fs_mark_sick(


diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 14f454e09e6e..eff8b4c3eb3e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c
@@ -1070,6 +1070,7 @@ xfs_mountfs(
1070	*/	1070	*/
1071	cancel_delayed_work_sync(&mp->m_reclaim_work);	1071	cancel_delayed_work_sync(&mp->m_reclaim_work);
1072	xfs_reclaim_inodes(mp, SYNC_WAIT);	1072	xfs_reclaim_inodes(mp, SYNC_WAIT);
		1073	xfs_health_unmount(mp);
1073	out_log_dealloc:	1074	out_log_dealloc:
1074	mp->m_flags |= XFS_MOUNT_UNMOUNTING;	1075	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
1075	xfs_log_mount_cancel(mp);	1076	xfs_log_mount_cancel(mp);
@@ -1152,6 +1153,7 @@ xfs_unmountfs(
1152	*/	1153	*/
1153	cancel_delayed_work_sync(&mp->m_reclaim_work);	1154	cancel_delayed_work_sync(&mp->m_reclaim_work);
1154	xfs_reclaim_inodes(mp, SYNC_WAIT);	1155	xfs_reclaim_inodes(mp, SYNC_WAIT);
		1156	xfs_health_unmount(mp);
1155		1157
1156	xfs_qm_unmount(mp);	1158	xfs_qm_unmount(mp);
1157		1159


diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f079841c7af6..2464ea351f83 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h
@@ -3461,8 +3461,10 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name, \
3461	TP_ARGS(mp, flags))	3461	TP_ARGS(mp, flags))
3462	DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);	3462	DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
3463	DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);	3463	DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
		3464	DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
3464	DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);	3465	DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
3465	DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);	3466	DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
		3467	DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
3466		3468
3467	DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,	3469	DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
3468	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),	3470	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
@@ -3488,6 +3490,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name, \
3488	TP_ARGS(mp, agno, flags))	3490	TP_ARGS(mp, agno, flags))
3489	DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);	3491	DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
3490	DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);	3492	DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
		3493	DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
3491		3494
3492	DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,	3495	DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
3493	TP_PROTO(struct xfs_inode *ip, unsigned int flags),	3496	TP_PROTO(struct xfs_inode *ip, unsigned int flags),