summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2019-09-06 00:35:39 -0400
committerDarrick J. Wong <darrick.wong@oracle.com>2019-09-06 00:36:12 -0400
commit8ab39f11d97464bd0f27443f45e43096814d3ccb (patch)
tree4daaffbf3d62c4287f272b7e25e4e3e628a6035a
parentcdea5459ce263fbc963657a7736762ae897a8ae6 (diff)
xfs: prevent CIL push holdoff in log recovery
generic/530 on a machine with enough ram and a non-preemptible kernel can run the AGI processing phase of log recovery entirely out of cache. This means it never blocks on locks, never waits for IO and runs entirely through the unlinked lists until it either completes or blocks and hangs because it has run out of log space. It runs out of log space because the background CIL push is scheduled but never runs. queue_work() queues the CIL work on the current CPU that is busy, and the workqueue code will not run it on any other CPU. Hence if the unlinked list processing never yields the CPU voluntarily, the push work is delayed indefinitely. This results in the CIL aggregating changes until all the log space is consumed. When the log recovery processing eventually blocks, the CIL flushes but because the last iclog isn't submitted for IO because it isn't full, the CIL flush never completes and nothing ever moves the log head forwards, or indeed inserts anything into the tail of the log, and hence nothing is able to get the log moving again and recovery hangs. There are several problems here, but the two obvious ones from the trace are that: a) log recovery does not yield the CPU for over 4 seconds, b) binding CIL pushes to a single CPU is a really bad idea. This patch addresses just these two aspects of the problem, and is suitable for backporting to work around any issues in older kernels. The more fundamental problem of preventing the CIL from consuming more than 50% of the log without committing will take more invasive and complex work, so will be done as followup work. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
-rw-r--r--fs/xfs/xfs_log_recover.c30
-rw-r--r--fs/xfs/xfs_super.c3
2 files changed, 23 insertions, 10 deletions
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index f05c6c99c4f3..508319039dce 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -5024,16 +5024,27 @@ xlog_recover_process_one_iunlink(
5024} 5024}
5025 5025
5026/* 5026/*
5027 * xlog_iunlink_recover 5027 * Recover AGI unlinked lists
5028 * 5028 *
5029 * This is called during recovery to process any inodes which 5029 * This is called during recovery to process any inodes which we unlinked but
5030 * we unlinked but not freed when the system crashed. These 5030 * not freed when the system crashed. These inodes will be on the lists in the
5031 * inodes will be on the lists in the AGI blocks. What we do 5031 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
5032 * here is scan all the AGIs and fully truncate and free any 5032 * any inodes found on the lists. Each inode is removed from the lists when it
5033 * inodes found on the lists. Each inode is removed from the 5033 * has been fully truncated and is freed. The freeing of the inode and its
5034 * lists when it has been fully truncated and is freed. The 5034 * removal from the list must be atomic.
5035 * freeing of the inode and its removal from the list must be 5035 *
5036 * atomic. 5036 * If everything we touch in the agi processing loop is already in memory, this
5037 * loop can hold the cpu for a long time. It runs without lock contention,
5038 * memory allocation contention, the need wait for IO, etc, and so will run
5039 * until we either run out of inodes to process, run low on memory or we run out
5040 * of log space.
5041 *
5042 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
5043 * and can prevent other filesytem work (such as CIL pushes) from running. This
5044 * can lead to deadlocks if the recovery process runs out of log reservation
5045 * space. Hence we need to yield the CPU when there is other kernel work
5046 * scheduled on this CPU to ensure other scheduled work can run without undue
5047 * latency.
5037 */ 5048 */
5038STATIC void 5049STATIC void
5039xlog_recover_process_iunlinks( 5050xlog_recover_process_iunlinks(
@@ -5080,6 +5091,7 @@ xlog_recover_process_iunlinks(
5080 while (agino != NULLAGINO) { 5091 while (agino != NULLAGINO) {
5081 agino = xlog_recover_process_one_iunlink(mp, 5092 agino = xlog_recover_process_one_iunlink(mp,
5082 agno, agino, bucket); 5093 agno, agino, bucket);
5094 cond_resched();
5083 } 5095 }
5084 } 5096 }
5085 xfs_buf_rele(agibp); 5097 xfs_buf_rele(agibp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f9450235533c..391b4748cae3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -818,7 +818,8 @@ xfs_init_mount_workqueues(
818 goto out_destroy_buf; 818 goto out_destroy_buf;
819 819
820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
821 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 821 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
822 0, mp->m_fsname);
822 if (!mp->m_cil_workqueue) 823 if (!mp->m_cil_workqueue)
823 goto out_destroy_unwritten; 824 goto out_destroy_unwritten;
824 825