2 files changed, 70 insertions, 4 deletions
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 53abd6b0a33..9b21f80f31c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -336,6 +336,7 @@ xfs_log_commit_cil(
 {
        struct log              *log = mp->m_log;
        int                     log_flags = 0;
+        int                     push = 0;
        if (flags & XFS_TRANS_RELEASE_LOG_RES)
                log_flags = XFS_LOG_REL_PERM_RESERV;
@@ -365,8 +366,20 @@ xfs_log_commit_cil(
        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
        xfs_trans_unreserve_and_mod_sb(tp);
-        /* background commit is allowed again */
+        /* check for background commit before unlock */
+        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+                push = 1;
        up_read(&log->l_cilp->xc_ctx_lock);
+        /*
+         * We need to push CIL every so often so we don't cache more than we
+         * can fit in the log. The limit really is that a checkpoint can't be
+         * more than half the log (the current checkpoint is not allowed to
+         * overwrite the previous checkpoint), but commit latency and memory
+         * usage limit this to a smaller size in most cases.
+         */
+        if (push)
+                xlog_cil_push(log, 0);
        return 0;
 }
@@ -429,18 +442,25 @@ xlog_cil_push(
        if (!cil)
                return 0;
-        /* XXX: don't sleep for background? */
        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);
-        /* lock out transaction commit */
+        /* lock out transaction commit, but don't block on background push */
-        down_write(&cil->xc_ctx_lock);
+        if (!down_write_trylock(&cil->xc_ctx_lock)) {
+                if (!push_now)
+                        goto out_free_ticket;
+                down_write(&cil->xc_ctx_lock);
+        }
        ctx = cil->xc_ctx;
        /* check if we've anything to push */
        if (list_empty(&cil->xc_cil))
                goto out_skip;
+        /* check for spurious background flush */
+        if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+                goto out_skip;
        /*
         * pull all the log vectors off the items in the CIL, and
         * remove the items from the CIL. We don't need the CIL lock
@@ -584,6 +604,7 @@ restart:
 out_skip:
        up_write(&cil->xc_ctx_lock);
+out_free_ticket:
        xfs_log_ticket_put(new_ctx->ticket);
        kmem_free(new_ctx);
        return 0;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 48d920891b9..8c072618965 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -425,6 +425,51 @@ struct xfs_cil {
 };
 /*
+ * The amount of log space we should allow the CIL to aggregate is difficult to
+ * size. Whatever we choose, we have to make sure we can get a reservation for
+ * the log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but it is not too large for
+ * the log and does not induce too much latency when writing out through the
+ * iclogs. We track both space consumed and the number of vectors in the
+ * checkpoint context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers.  The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for the
+ * checkpoint size so long as they don't violate any other size rules.  Hence
+ * the initial maximum size for the checkpoint transaction will be set to a
+ * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
+ * right now based on the latency of writing out a large amount of data through
+ * the circular iclog buffers.
+ */
+#define XLOG_CIL_SPACE_LIMIT(log)       \
+        (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+/*
 * The reservation head lsn is not made up of a cycle number and block number.
 * Instead, it uses a cycle number and byte number.  Logs don't expect to
 * overflow 31 bits worth of byte offset, so using a byte number will mean

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 53abd6b0a33..9b21f80f31c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c
@@ -336,6 +336,7 @@ xfs_log_commit_cil(
336	{	336	{
337	struct log *log = mp->m_log;	337	struct log *log = mp->m_log;
338	int log_flags = 0;	338	int log_flags = 0;
		339	int push = 0;
339		340
340	if (flags & XFS_TRANS_RELEASE_LOG_RES)	341	if (flags & XFS_TRANS_RELEASE_LOG_RES)
341	log_flags = XFS_LOG_REL_PERM_RESERV;	342	log_flags = XFS_LOG_REL_PERM_RESERV;
@@ -365,8 +366,20 @@ xfs_log_commit_cil(
365	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);	366	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
366	xfs_trans_unreserve_and_mod_sb(tp);	367	xfs_trans_unreserve_and_mod_sb(tp);
367		368
368	/* background commit is allowed again */	369	/* check for background commit before unlock */
		370	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
		371	push = 1;
369	up_read(&log->l_cilp->xc_ctx_lock);	372	up_read(&log->l_cilp->xc_ctx_lock);
		373
		374	/*
		375	* We need to push CIL every so often so we don't cache more than we
		376	* can fit in the log. The limit really is that a checkpoint can't be
		377	* more than half the log (the current checkpoint is not allowed to
		378	* overwrite the previous checkpoint), but commit latency and memory
		379	* usage limit this to a smaller size in most cases.
		380	*/
		381	if (push)
		382	xlog_cil_push(log, 0);
370	return 0;	383	return 0;
371	}	384	}
372		385
@@ -429,18 +442,25 @@ xlog_cil_push(
429	if (!cil)	442	if (!cil)
430	return 0;	443	return 0;
431		444
432	/* XXX: don't sleep for background? */
433	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP\|KM_NOFS);	445	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP\|KM_NOFS);
434	new_ctx->ticket = xlog_cil_ticket_alloc(log);	446	new_ctx->ticket = xlog_cil_ticket_alloc(log);
435		447
436	/* lock out transaction commit */	448	/* lock out transaction commit, but don't block on background push */
437	down_write(&cil->xc_ctx_lock);	449	if (!down_write_trylock(&cil->xc_ctx_lock)) {
		450	if (!push_now)
		451	goto out_free_ticket;
		452	down_write(&cil->xc_ctx_lock);
		453	}
438	ctx = cil->xc_ctx;	454	ctx = cil->xc_ctx;
439		455
440	/* check if we've anything to push */	456	/* check if we've anything to push */
441	if (list_empty(&cil->xc_cil))	457	if (list_empty(&cil->xc_cil))
442	goto out_skip;	458	goto out_skip;
443		459
		460	/* check for spurious background flush */
		461	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
		462	goto out_skip;
		463
444	/*	464	/*
445	* pull all the log vectors off the items in the CIL, and	465	* pull all the log vectors off the items in the CIL, and
446	* remove the items from the CIL. We don't need the CIL lock	466	* remove the items from the CIL. We don't need the CIL lock
@@ -584,6 +604,7 @@ restart:
584		604
585	out_skip:	605	out_skip:
586	up_write(&cil->xc_ctx_lock);	606	up_write(&cil->xc_ctx_lock);
		607	out_free_ticket:
587	xfs_log_ticket_put(new_ctx->ticket);	608	xfs_log_ticket_put(new_ctx->ticket);
588	kmem_free(new_ctx);	609	kmem_free(new_ctx);
589	return 0;	610	return 0;


diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 48d920891b9..8c072618965 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h
@@ -425,6 +425,51 @@ struct xfs_cil {
425	};	425	};
426		426
427	/*	427	/*
		428	* The amount of log space we should allow the CIL to aggregate is difficult to
		429	* size. Whatever we choose, we have to make sure we can get a reservation for
		430	* the log space effectively, that it is large enough to capture sufficient
		431	* relogging to reduce log buffer IO significantly, but it is not too large for
		432	* the log and does not induce too much latency when writing out through the
		433	* iclogs. We track both space consumed and the number of vectors in the
		434	* checkpoint context, so we need to decide which to use for limiting.
		435	*
		436	* Every log buffer we write out during a push needs a header reserved, which
		437	* is at least one sector and more for v2 logs. Hence we need a reservation of
		438	* at least 512 bytes per 32k of log space just for the LR headers. That means
		439	* 16KB of reservation per megabyte of delayed logging space we will consume,
		440	* plus various headers. The number of headers will vary based on the num of
		441	* io vectors, so limiting on a specific number of vectors is going to result
		442	* in transactions of varying size. IOWs, it is more consistent to track and
		443	* limit space consumed in the log rather than by the number of objects being
		444	* logged in order to prevent checkpoint ticket overruns.
		445	*
		446	* Further, use of static reservations through the log grant mechanism is
		447	* problematic. It introduces a lot of complexity (e.g. reserve grant vs write
		448	* grant) and a significant deadlock potential because regranting write space
		449	* can block on log pushes. Hence if we have to regrant log space during a log
		450	* push, we can deadlock.
		451	*
		452	* However, we can avoid this by use of a dynamic "reservation stealing"
		453	* technique during transaction commit whereby unused reservation space in the
		454	* transaction ticket is transferred to the CIL ctx commit ticket to cover the
		455	* space needed by the checkpoint transaction. This means that we never need to
		456	* specifically reserve space for the CIL checkpoint transaction, nor do we
		457	* need to regrant space once the checkpoint completes. This also means the
		458	* checkpoint transaction ticket is specific to the checkpoint context, rather
		459	* than the CIL itself.
		460	*
		461	* With dynamic reservations, we can basically make up arbitrary limits for the
		462	* checkpoint size so long as they don't violate any other size rules. Hence
		463	* the initial maximum size for the checkpoint transaction will be set to a
		464	* quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
		465	* right now based on the latency of writing out a large amount of data through
		466	* the circular iclog buffers.
		467	*/
		468
		469	#define XLOG_CIL_SPACE_LIMIT(log) \
		470	(min((log->l_logsize >> 2), (8 * 1024 * 1024)))
		471
		472	/*
428	* The reservation head lsn is not made up of a cycle number and block number.	473	* The reservation head lsn is not made up of a cycle number and block number.
429	* Instead, it uses a cycle number and byte number. Logs don't expect to	474	* Instead, it uses a cycle number and byte number. Logs don't expect to
430	* overflow 31 bits worth of byte offset, so using a byte number will mean	475	* overflow 31 bits worth of byte offset, so using a byte number will mean