aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2010-05-21 00:37:18 -0400
committerAlex Elder <aelder@sgi.com>2010-05-24 11:38:03 -0400
commit71e330b593905e40d6c5afa824d38ee02d70ce5f (patch)
tree4c9fa6c4766280752fc40f3057fd6cf64396c16c /fs/xfs
parenta9a745daadab26f13884ff26a50fa38247c11ce9 (diff)
xfs: Introduce delayed logging core code
The delayed logging code only changes in-memory structures and as such can be enabled and disabled with a mount option. Add the mount option and emit a warning that this is an experimental feature that should not be used in production yet. We also need infrastructure to track committed items that have not yet been written to the log. This is what the Committed Item List (CIL) is for. The log item also needs to be extended to track the current log vector, the associated memory buffer and its location in the Committed Item List. Extend the log item and log vector structures to enable this tracking. To maintain the current log format for transactions with delayed logging, we need to introduce a checkpoint transaction and a context for tracking each checkpoint from initiation to transaction completion. This includes adding a log ticket for tracking log space required/used by the context checkpoint. To track all the changes we need an io vector array per log item, rather than a single array for the entire transaction. Using the new log vector structure for this requires two passes - the first to allocate the log vector structures and chain them together, and the second to fill them out. This log vector chain can then be passed to the CIL for formatting, pinning and insertion into the CIL. Formatting of the log vector chain is relatively simple - it's just a loop over the iovecs on each log vector, but it is made slightly more complex because we re-write the iovec after the copy to point back at the memory buffer we just copied into. This code also needs to pin log items. If the log item is not already tracked in this checkpoint context, then it needs to be pinned. Otherwise it is already pinned and we don't need to pin it again. The only other complexity is calculating the amount of new log space the formatting has consumed. 
This needs to be accounted to the transaction in progress, and the accounting is made more complex because we also need to steal space from it for log metadata in the checkpoint transaction. Calculate all this at insert time and update all the tickets, counters, etc correctly. Once we've formatted all the log items in the transaction, attach the busy extents to the checkpoint context so the busy extents live until checkpoint completion and can be processed at that point in time. Transactions can then be freed at this point in time. Now we need to issue checkpoints - we are tracking the amount of log space used by the items in the CIL, so we can trigger background checkpoints when the space usage gets to a certain threshold. Otherwise, checkpoints need to be triggered when a log synchronisation point is reached - a log force event. Because the log write code already handles chained log vectors, writing the transaction is trivial, too. Construct a transaction header, add it to the head of the chain and write it into the log, then issue a commit record write. Then we can release the checkpoint log ticket and attach the context to the log buffer so it can be called during I/O completion to complete the checkpoint. We also need to allow for synchronising multiple in-flight checkpoints. This is needed for two things - the first is to ensure that checkpoint commit records appear in the log in the correct sequence order (so they are replayed in the correct order). The second is so that xfs_log_force_lsn() operates correctly and only flushes and/or waits for the specific sequence it was provided with. To do this we need a wait variable and a list tracking the checkpoint commits in progress. We can walk this list and wait for the checkpoints to change state or complete easily, and this provides the necessary synchronisation for correct operation in both cases. 
Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c10
-rw-r--r--fs/xfs/xfs_log.c64
-rw-r--r--fs/xfs/xfs_log.h9
-rw-r--r--fs/xfs/xfs_log_cil.c659
-rw-r--r--fs/xfs/xfs_log_priv.h71
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_trans.c103
-rw-r--r--fs/xfs/xfs_trans.h8
-rw-r--r--fs/xfs/xfs_trans_item.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h11
11 files changed, 912 insertions, 30 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a8ea03afe2e3..775de2b5727c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
122 124
123/* 125/*
124 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
374 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
376 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 548 { 0, NULL }
539 }; 549 };
540 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 19d0c5f73e24..027ebfe20677 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
56STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
57STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
58 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
59 xlog_in_core_t **commit_iclog, uint flags);
60 57
61/* local state machine functions */ 58/* local state machine functions */
62STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,12 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
86STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
87 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
88 85
89
90/* local ticket functions */
91STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, int unit_bytes, int count,
92 char clientid, uint flags,
93 int alloc_flags);
94
95#if defined(DEBUG) 86#if defined(DEBUG)
96STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
97STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
@@ -460,6 +451,13 @@ xfs_log_mount(
460 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
461 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
462 453
454 /*
455 * Now the log has been fully initialised and we know where our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
463 return 0; 461 return 0;
464 462
465out_destroy_ail: 463out_destroy_ail:
@@ -666,6 +664,10 @@ xfs_log_item_init(
666 item->li_ailp = mp->m_ail; 664 item->li_ailp = mp->m_ail;
667 item->li_type = type; 665 item->li_type = type;
668 item->li_ops = ops; 666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
669} 671}
670 672
671/* 673/*
@@ -1176,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1176 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1177 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1178 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1179 return log; 1184 return log;
1180 1185
1181out_free_iclog: 1186out_free_iclog:
@@ -1502,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1502 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1503 int i; 1508 int i;
1504 1509
1510 xlog_cil_destroy(log);
1511
1505 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1506 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1507 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1544,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1544 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1545 * the reservation 1552 * the reservation
1546 */ 1553 */
1547STATIC void 1554void
1548xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1549{ 1558{
1550 uint i; 1559 uint i;
1551 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1877,7 +1886,7 @@ xlog_write_copy_finish(
1877 * we don't update ic_offset until the end when we know exactly how many 1886 * we don't update ic_offset until the end when we know exactly how many
1878 * bytes have been written out. 1887 * bytes have been written out.
1879 */ 1888 */
1880STATIC int 1889int
1881xlog_write( 1890xlog_write(
1882 struct log *log, 1891 struct log *log,
1883 struct xfs_log_vec *log_vector, 1892 struct xfs_log_vec *log_vector,
@@ -1901,9 +1910,26 @@ xlog_write(
1901 *start_lsn = 0; 1910 *start_lsn = 0;
1902 1911
1903 len = xlog_write_calc_vec_length(ticket, log_vector); 1912 len = xlog_write_calc_vec_length(ticket, log_vector);
1904 if (ticket->t_curr_res < len) 1913 if (log->l_cilp) {
1914 /*
1915 * Region headers and bytes are already accounted for.
1916 * We only need to take into account start records and
1917 * split regions in this function.
1918 */
1919 if (ticket->t_flags & XLOG_TIC_INITED)
1920 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1921
1922 /*
1923 * Commit record headers need to be accounted for. These
1924 * come in as separate writes so are easy to detect.
1925 */
1926 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1927 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1928 } else
1929 ticket->t_curr_res -= len;
1930
1931 if (ticket->t_curr_res < 0)
1905 xlog_print_tic_res(log->l_mp, ticket); 1932 xlog_print_tic_res(log->l_mp, ticket);
1906 ticket->t_curr_res -= len;
1907 1933
1908 index = 0; 1934 index = 0;
1909 lv = log_vector; 1935 lv = log_vector;
@@ -2999,6 +3025,8 @@ _xfs_log_force(
2999 3025
3000 XFS_STATS_INC(xs_log_force); 3026 XFS_STATS_INC(xs_log_force);
3001 3027
3028 xlog_cil_push(log, 1);
3029
3002 spin_lock(&log->l_icloglock); 3030 spin_lock(&log->l_icloglock);
3003 3031
3004 iclog = log->l_iclog; 3032 iclog = log->l_iclog;
@@ -3148,6 +3176,12 @@ _xfs_log_force_lsn(
3148 3176
3149 XFS_STATS_INC(xs_log_force); 3177 XFS_STATS_INC(xs_log_force);
3150 3178
3179 if (log->l_cilp) {
3180 lsn = xlog_cil_push_lsn(log, lsn);
3181 if (lsn == NULLCOMMITLSN)
3182 return 0;
3183 }
3184
3151try_again: 3185try_again:
3152 spin_lock(&log->l_icloglock); 3186 spin_lock(&log->l_icloglock);
3153 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
@@ -3322,7 +3356,7 @@ xfs_log_get_trans_ident(
3322/* 3356/*
3323 * Allocate and initialise a new log ticket. 3357 * Allocate and initialise a new log ticket.
3324 */ 3358 */
3325STATIC xlog_ticket_t * 3359xlog_ticket_t *
3326xlog_ticket_alloc( 3360xlog_ticket_alloc(
3327 struct log *log, 3361 struct log *log,
3328 int unit_bytes, 3362 int unit_bytes,
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 05f205aac913..4a0c57432e8f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -113,6 +113,9 @@ struct xfs_log_vec {
113 struct xfs_log_vec *lv_next; /* next lv in build list */ 113 struct xfs_log_vec *lv_next; /* next lv in build list */
114 int lv_niovecs; /* number of iovecs in lv */ 114 int lv_niovecs; /* number of iovecs in lv */
115 struct xfs_log_iovec *lv_iovecp; /* iovec array */ 115 struct xfs_log_iovec *lv_iovecp; /* iovec array */
116 struct xfs_log_item *lv_item; /* owner */
117 char *lv_buf; /* formatted buffer */
118 int lv_buf_len; /* size of formatted buffer */
116}; 119};
117 120
118/* 121/*
@@ -187,11 +190,15 @@ int xfs_log_need_covered(struct xfs_mount *mp);
187 190
188void xlog_iodone(struct xfs_buf *); 191void xlog_iodone(struct xfs_buf *);
189 192
190struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 193struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
191void xfs_log_ticket_put(struct xlog_ticket *ticket); 194void xfs_log_ticket_put(struct xlog_ticket *ticket);
192 195
193xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 196xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
194 197
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags);
201
195#endif 202#endif
196 203
197 204
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..53abd6b0a333
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,659 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_error.h"
33#include "xfs_alloc.h"
34
35/*
36 * Perform initial CIL structure initialisation. If the CIL is not
37 * enabled in this filesystem, ensure the log->l_cilp is null so
38 * we can check this conditional to determine if we are doing delayed
39 * logging or not.
40 */
41int
42xlog_cil_init(
43 struct log *log)
44{
45 struct xfs_cil *cil;
46 struct xfs_cil_ctx *ctx;
47
48 log->l_cilp = NULL;
49 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
50 return 0;
51
52 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
53 if (!cil)
54 return ENOMEM;
55
56 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
57 if (!ctx) {
58 kmem_free(cil);
59 return ENOMEM;
60 }
61
62 INIT_LIST_HEAD(&cil->xc_cil);
63 INIT_LIST_HEAD(&cil->xc_committing);
64 spin_lock_init(&cil->xc_cil_lock);
65 init_rwsem(&cil->xc_ctx_lock);
66 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
67
68 INIT_LIST_HEAD(&ctx->committing);
69 INIT_LIST_HEAD(&ctx->busy_extents);
70 ctx->sequence = 1;
71 ctx->cil = cil;
72 cil->xc_ctx = ctx;
73
74 cil->xc_log = log;
75 log->l_cilp = cil;
76 return 0;
77}
78
79void
80xlog_cil_destroy(
81 struct log *log)
82{
83 if (!log->l_cilp)
84 return;
85
86 if (log->l_cilp->xc_ctx) {
87 if (log->l_cilp->xc_ctx->ticket)
88 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
89 kmem_free(log->l_cilp->xc_ctx);
90 }
91
92 ASSERT(list_empty(&log->l_cilp->xc_cil));
93 kmem_free(log->l_cilp);
94}
95
96/*
97 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
98 * recover, so we don't allow failure here. Also, we allocate in a context that
99 * we don't want to be issuing transactions from, so we need to tell the
100 * allocation code this as well.
101 *
102 * We don't reserve any space for the ticket - we are going to steal whatever
103 * space we require from transactions as they commit. To ensure we reserve all
104 * the space required, we need to set the current reservation of the ticket to
105 * zero so that we know to steal the initial transaction overhead from the
106 * first transaction commit.
107 */
108static struct xlog_ticket *
109xlog_cil_ticket_alloc(
110 struct log *log)
111{
112 struct xlog_ticket *tic;
113
114 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
115 KM_SLEEP|KM_NOFS);
116 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
117
118 /*
119 * set the current reservation to zero so we know to steal the basic
120 * transaction overhead reservation from the first transaction commit.
121 */
122 tic->t_curr_res = 0;
123 return tic;
124}
125
126/*
127 * After the first stage of log recovery is done, we know where the head and
128 * tail of the log are. We need this log initialisation done before we can
129 * initialise the first CIL checkpoint context.
130 *
131 * Here we allocate a log ticket to track space usage during a CIL push. This
132 * ticket is passed to xlog_write() directly so that we don't slowly leak log
133 * space by failing to account for space used by log headers and additional
134 * region headers for split regions.
135 */
136void
137xlog_cil_init_post_recovery(
138 struct log *log)
139{
140 if (!log->l_cilp)
141 return;
142
143 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
144 log->l_cilp->xc_ctx->sequence = 1;
145 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
146 log->l_curr_block);
147}
148
149/*
150 * Insert the log item into the CIL and calculate the difference in space
151 * consumed by the item. Add the space to the checkpoint ticket and calculate
152 * if the change requires additional log metadata. If it does, take that space
153 * as well. Remove the amount of space we addded to the checkpoint ticket from
154 * the current transaction ticket so that the accounting works out correctly.
155 *
156 * If this is the first time the item is being placed into the CIL in this
157 * context, pin it so it can't be written to disk until the CIL is flushed to
158 * the iclog and the iclog written to disk.
159 */
160static void
161xlog_cil_insert(
162 struct log *log,
163 struct xlog_ticket *ticket,
164 struct xfs_log_item *item,
165 struct xfs_log_vec *lv)
166{
167 struct xfs_cil *cil = log->l_cilp;
168 struct xfs_log_vec *old = lv->lv_item->li_lv;
169 struct xfs_cil_ctx *ctx = cil->xc_ctx;
170 int len;
171 int diff_iovecs;
172 int iclog_space;
173
174 if (old) {
175 /* existing lv on log item, space used is a delta */
176 ASSERT(!list_empty(&item->li_cil));
177 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
178
179 len = lv->lv_buf_len - old->lv_buf_len;
180 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
181 kmem_free(old->lv_buf);
182 kmem_free(old);
183 } else {
184 /* new lv, must pin the log item */
185 ASSERT(!lv->lv_item->li_lv);
186 ASSERT(list_empty(&item->li_cil));
187
188 len = lv->lv_buf_len;
189 diff_iovecs = lv->lv_niovecs;
190 IOP_PIN(lv->lv_item);
191
192 }
193 len += diff_iovecs * sizeof(xlog_op_header_t);
194
195 /* attach new log vector to log item */
196 lv->lv_item->li_lv = lv;
197
198 spin_lock(&cil->xc_cil_lock);
199 list_move_tail(&item->li_cil, &cil->xc_cil);
200 ctx->nvecs += diff_iovecs;
201
202 /*
203 * Now transfer enough transaction reservation to the context ticket
204 * for the checkpoint. The context ticket is special - the unit
205 * reservation has to grow as well as the current reservation as we
206 * steal from tickets so we can correctly determine the space used
207 * during the transaction commit.
208 */
209 if (ctx->ticket->t_curr_res == 0) {
210 /* first commit in checkpoint, steal the header reservation */
211 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
212 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
213 ticket->t_curr_res -= ctx->ticket->t_unit_res;
214 }
215
216 /* do we need space for more log record headers? */
217 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
218 if (len > 0 && (ctx->space_used / iclog_space !=
219 (ctx->space_used + len) / iclog_space)) {
220 int hdrs;
221
222 hdrs = (len + iclog_space - 1) / iclog_space;
223 /* need to take into account split region headers, too */
224 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
225 ctx->ticket->t_unit_res += hdrs;
226 ctx->ticket->t_curr_res += hdrs;
227 ticket->t_curr_res -= hdrs;
228 ASSERT(ticket->t_curr_res >= len);
229 }
230 ticket->t_curr_res -= len;
231 ctx->space_used += len;
232
233 spin_unlock(&cil->xc_cil_lock);
234}
235
236/*
237 * Format log item into a flat buffers
238 *
239 * For delayed logging, we need to hold a formatted buffer containing all the
240 * changes on the log item. This enables us to relog the item in memory and
241 * write it out asynchronously without needing to relock the object that was
242 * modified at the time it gets written into the iclog.
243 *
244 * This function builds a vector for the changes in each log item in the
245 * transaction. It then works out the length of the buffer needed for each log
246 * item, allocates them and formats the vector for the item into the buffer.
247 * The buffer is then attached to the log item are then inserted into the
248 * Committed Item List for tracking until the next checkpoint is written out.
249 *
250 * We don't set up region headers during this process; we simply copy the
251 * regions into the flat buffer. We can do this because we still have to do a
252 * formatting step to write the regions into the iclog buffer. Writing the
253 * ophdrs during the iclog write means that we can support splitting large
254 * regions across iclog boundares without needing a change in the format of the
255 * item/region encapsulation.
256 *
257 * Hence what we need to do now is change the rewrite the vector array to point
258 * to the copied region inside the buffer we just allocated. This allows us to
259 * format the regions into the iclog as though they are being formatted
260 * directly out of the objects themselves.
261 */
262static void
263xlog_cil_format_items(
264 struct log *log,
265 struct xfs_log_vec *log_vector,
266 struct xlog_ticket *ticket,
267 xfs_lsn_t *start_lsn)
268{
269 struct xfs_log_vec *lv;
270
271 if (start_lsn)
272 *start_lsn = log->l_cilp->xc_ctx->sequence;
273
274 ASSERT(log_vector);
275 for (lv = log_vector; lv; lv = lv->lv_next) {
276 void *ptr;
277 int index;
278 int len = 0;
279
280 /* build the vector array and calculate it's length */
281 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
282 for (index = 0; index < lv->lv_niovecs; index++)
283 len += lv->lv_iovecp[index].i_len;
284
285 lv->lv_buf_len = len;
286 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
287 ptr = lv->lv_buf;
288
289 for (index = 0; index < lv->lv_niovecs; index++) {
290 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
291
292 memcpy(ptr, vec->i_addr, vec->i_len);
293 vec->i_addr = ptr;
294 ptr += vec->i_len;
295 }
296 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
297
298 xlog_cil_insert(log, ticket, lv->lv_item, lv);
299 }
300}
301
302static void
303xlog_cil_free_logvec(
304 struct xfs_log_vec *log_vector)
305{
306 struct xfs_log_vec *lv;
307
308 for (lv = log_vector; lv; ) {
309 struct xfs_log_vec *next = lv->lv_next;
310 kmem_free(lv->lv_buf);
311 kmem_free(lv);
312 lv = next;
313 }
314}
315
316/*
317 * Commit a transaction with the given vector to the Committed Item List.
318 *
319 * To do this, we need to format the item, pin it in memory if required and
320 * account for the space used by the transaction. Once we have done that we
321 * need to release the unused reservation for the transaction, attach the
322 * transaction to the checkpoint context so we carry the busy extents through
323 * to checkpoint completion, and then unlock all the items in the transaction.
324 *
325 * For more specific information about the order of operations in
326 * xfs_log_commit_cil() please refer to the comments in
327 * xfs_trans_commit_iclog().
328 */
329int
330xfs_log_commit_cil(
331 struct xfs_mount *mp,
332 struct xfs_trans *tp,
333 struct xfs_log_vec *log_vector,
334 xfs_lsn_t *commit_lsn,
335 int flags)
336{
337 struct log *log = mp->m_log;
338 int log_flags = 0;
339
340 if (flags & XFS_TRANS_RELEASE_LOG_RES)
341 log_flags = XFS_LOG_REL_PERM_RESERV;
342
343 if (XLOG_FORCED_SHUTDOWN(log)) {
344 xlog_cil_free_logvec(log_vector);
345 return XFS_ERROR(EIO);
346 }
347
348 /* lock out background commit */
349 down_read(&log->l_cilp->xc_ctx_lock);
350 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
351
352 /* check we didn't blow the reservation */
353 if (tp->t_ticket->t_curr_res < 0)
354 xlog_print_tic_res(log->l_mp, tp->t_ticket);
355
356 /* attach the transaction to the CIL if it has any busy extents */
357 if (!list_empty(&tp->t_busy)) {
358 spin_lock(&log->l_cilp->xc_cil_lock);
359 list_splice_init(&tp->t_busy,
360 &log->l_cilp->xc_ctx->busy_extents);
361 spin_unlock(&log->l_cilp->xc_cil_lock);
362 }
363
364 tp->t_commit_lsn = *commit_lsn;
365 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
366 xfs_trans_unreserve_and_mod_sb(tp);
367
368 /* background commit is allowed again */
369 up_read(&log->l_cilp->xc_ctx_lock);
370 return 0;
371}
372
373/*
374 * Mark all items committed and clear busy extents. We free the log vector
375 * chains in a separate pass so that we unpin the log items as quickly as
376 * possible.
377 */
378static void
379xlog_cil_committed(
380 void *args,
381 int abort)
382{
383 struct xfs_cil_ctx *ctx = args;
384 struct xfs_log_vec *lv;
385 int abortflag = abort ? XFS_LI_ABORTED : 0;
386 struct xfs_busy_extent *busyp, *n;
387
388 /* unpin all the log items */
389 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
390 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
391 abortflag);
392 }
393
394 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
395 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
396
397 spin_lock(&ctx->cil->xc_cil_lock);
398 list_del(&ctx->committing);
399 spin_unlock(&ctx->cil->xc_cil_lock);
400
401 xlog_cil_free_logvec(ctx->lv_chain);
402 kmem_free(ctx);
403}
404
405/*
406 * Push the Committed Item List to the log. If the push_now flag is not set,
407 * then it is a background flush and so we can choose to ignore it.
408 */
409int
410xlog_cil_push(
411 struct log *log,
412 int push_now)
413{
414 struct xfs_cil *cil = log->l_cilp;
415 struct xfs_log_vec *lv;
416 struct xfs_cil_ctx *ctx;
417 struct xfs_cil_ctx *new_ctx;
418 struct xlog_in_core *commit_iclog;
419 struct xlog_ticket *tic;
420 int num_lv;
421 int num_iovecs;
422 int len;
423 int error = 0;
424 struct xfs_trans_header thdr;
425 struct xfs_log_iovec lhdr;
426 struct xfs_log_vec lvhdr = { NULL };
427 xfs_lsn_t commit_lsn;
428
429 if (!cil)
430 return 0;
431
432 /* XXX: don't sleep for background? */
433 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
434 new_ctx->ticket = xlog_cil_ticket_alloc(log);
435
436 /* lock out transaction commit */
437 down_write(&cil->xc_ctx_lock);
438 ctx = cil->xc_ctx;
439
440 /* check if we've anything to push */
441 if (list_empty(&cil->xc_cil))
442 goto out_skip;
443
444 /*
445 * pull all the log vectors off the items in the CIL, and
446 * remove the items from the CIL. We don't need the CIL lock
447 * here because it's only needed on the transaction commit
448 * side which is currently locked out by the flush lock.
449 */
450 lv = NULL;
451 num_lv = 0;
452 num_iovecs = 0;
453 len = 0;
454 while (!list_empty(&cil->xc_cil)) {
455 struct xfs_log_item *item;
456 int i;
457
458 item = list_first_entry(&cil->xc_cil,
459 struct xfs_log_item, li_cil);
460 list_del_init(&item->li_cil);
461 if (!ctx->lv_chain)
462 ctx->lv_chain = item->li_lv;
463 else
464 lv->lv_next = item->li_lv;
465 lv = item->li_lv;
466 item->li_lv = NULL;
467
468 num_lv++;
469 num_iovecs += lv->lv_niovecs;
470 for (i = 0; i < lv->lv_niovecs; i++)
471 len += lv->lv_iovecp[i].i_len;
472 }
473
474 /*
475 * initialise the new context and attach it to the CIL. Then attach
476 * the current context to the CIL committing list so it can be found
477 * during log forces to extract the commit lsn of the sequence that
478 * needs to be forced.
479 */
480 INIT_LIST_HEAD(&new_ctx->committing);
481 INIT_LIST_HEAD(&new_ctx->busy_extents);
482 new_ctx->sequence = ctx->sequence + 1;
483 new_ctx->cil = cil;
484 cil->xc_ctx = new_ctx;
485
486 /*
487 * The switch is now done, so we can drop the context lock and move out
488 * of a shared context. We can't just go straight to the commit record,
489 * though - we need to synchronise with previous and future commits so
490 * that the commit records are correctly ordered in the log to ensure
491 * that we process items during log IO completion in the correct order.
492 *
493 * For example, if we get an EFI in one checkpoint and the EFD in the
494 * next (e.g. due to log forces), we do not want the checkpoint with
495 * the EFD to be committed before the checkpoint with the EFI. Hence
496 * we must strictly order the commit records of the checkpoints so
497 * that: a) the checkpoint callbacks are attached to the iclogs in the
498 * correct order; and b) the checkpoints are replayed in correct order
499 * in log recovery.
500 *
501 * Hence we need to add this context to the committing context list so
502 * that higher sequences will wait for us to write out a commit record
503 * before they do.
504 */
505 spin_lock(&cil->xc_cil_lock);
506 list_add(&ctx->committing, &cil->xc_committing);
507 spin_unlock(&cil->xc_cil_lock);
508 up_write(&cil->xc_ctx_lock);
509
510 /*
511 * Build a checkpoint transaction header and write it to the log to
512 * begin the transaction. We need to account for the space used by the
513 * transaction header here as it is not accounted for in xlog_write().
514 *
515 * The LSN we need to pass to the log items on transaction commit is
516 * the LSN reported by the first log vector write. If we use the commit
517 * record lsn then we can move the tail beyond the grant write head.
518 */
519 tic = ctx->ticket;
520 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
521 thdr.th_type = XFS_TRANS_CHECKPOINT;
522 thdr.th_tid = tic->t_tid;
523 thdr.th_num_items = num_iovecs;
524 lhdr.i_addr = (xfs_caddr_t)&thdr;
525 lhdr.i_len = sizeof(xfs_trans_header_t);
526 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
527 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
528
529 lvhdr.lv_niovecs = 1;
530 lvhdr.lv_iovecp = &lhdr;
531 lvhdr.lv_next = ctx->lv_chain;
532
533 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
534 if (error)
535 goto out_abort;
536
537 /*
538 * now that we've written the checkpoint into the log, strictly
539 * order the commit records so replay will get them in the right order.
540 */
541restart:
542 spin_lock(&cil->xc_cil_lock);
543 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
544 /*
545 * Higher sequences will wait for this one so skip them.
546 * Don't wait for our own sequence, either.
547 */
548 if (new_ctx->sequence >= ctx->sequence)
549 continue;
550 if (!new_ctx->commit_lsn) {
551 /*
552 * It is still being pushed! Wait for the push to
553 * complete, then start again from the beginning.
554 */
555 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
556 goto restart;
557 }
558 }
559 spin_unlock(&cil->xc_cil_lock);
560
561 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
562 if (error || commit_lsn == -1)
563 goto out_abort;
564
565 /* attach all the transactions w/ busy extents to iclog */
566 ctx->log_cb.cb_func = xlog_cil_committed;
567 ctx->log_cb.cb_arg = ctx;
568 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
569 if (error)
570 goto out_abort;
571
572 /*
573 * now the checkpoint commit is complete and we've attached the
574 * callbacks to the iclog we can assign the commit LSN to the context
575 * and wake up anyone who is waiting for the commit to complete.
576 */
577 spin_lock(&cil->xc_cil_lock);
578 ctx->commit_lsn = commit_lsn;
579 sv_broadcast(&cil->xc_commit_wait);
580 spin_unlock(&cil->xc_cil_lock);
581
582 /* release the hounds! */
583 return xfs_log_release_iclog(log->l_mp, commit_iclog);
584
585out_skip:
586 up_write(&cil->xc_ctx_lock);
587 xfs_log_ticket_put(new_ctx->ticket);
588 kmem_free(new_ctx);
589 return 0;
590
591out_abort:
592 xlog_cil_committed(ctx, XFS_LI_ABORTED);
593 return XFS_ERROR(EIO);
594}
595
596/*
597 * Conditionally push the CIL based on the sequence passed in.
598 *
599 * We only need to push if we haven't already pushed the sequence
600 * number given. Hence the only time we will trigger a push here is
601 * if the push sequence is the same as the current context.
602 *
603 * We return the current commit lsn to allow the callers to determine if an
604 * iclog flush is necessary following this call.
 *
 * The value returned is the commit_lsn of the context on the xc_committing
 * list whose sequence equals push_seq. If no such context is on the list,
 * NULLCOMMITLSN is returned. Before returning, every context on that list
 * with a sequence at or below push_seq has been seen with a non-zero
 * commit_lsn.
605 *
606 * XXX: Initially, just push the CIL unconditionally and return whatever
607 * commit lsn is there. It'll be empty, so this is broken for now.
608 */
609xfs_lsn_t
610xlog_cil_push_lsn(
611 struct log *log,
612 xfs_lsn_t push_seq)
613{
614 struct xfs_cil *cil = log->l_cilp;
615 struct xfs_cil_ctx *ctx;
616 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
617
618restart:
	/* xc_ctx_lock is held for write while the current sequence is examined. */
619 down_write(&cil->xc_ctx_lock);
620 ASSERT(push_seq <= cil->xc_ctx->sequence);
621
622 /* check to see if we need to force out the current context */
623 if (push_seq == cil->xc_ctx->sequence) {
	/*
	 * Drop the context lock before pushing, then start over so the
	 * sequence comparison is redone under the lock after the push.
	 */
624 up_write(&cil->xc_ctx_lock);
625 xlog_cil_push(log, 1);
626 goto restart;
627 }
628
629 /*
630 * See if we can find a previous sequence still committing.
631 * We can drop the flush lock as soon as we have the cil lock
632 * because we are now only comparing contexts protected by
633 * the cil lock.
634 *
635 * We need to wait for all previous sequence commits to complete
636 * before allowing the force of push_seq to go ahead. Hence block
637 * on commits for those as well.
638 */
639 spin_lock(&cil->xc_cil_lock);
640 up_write(&cil->xc_ctx_lock);
641 list_for_each_entry(ctx, &cil->xc_committing, committing) {
	/* Contexts newer than the requested sequence are of no interest. */
642 if (ctx->sequence > push_seq)
643 continue;
644 if (!ctx->commit_lsn) {
645 /*
646 * It is still being pushed! Wait for the push to
647 * complete, then start again from the beginning.
648 */
	/*
	 * NOTE(review): sv_wait() is handed xc_cil_lock and the restart
	 * path does not unlock it here, so sv_wait() presumably releases
	 * the spinlock before returning - confirm.
	 */
649 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
650 goto restart;
651 }
	/* Older sequences only had to be waited on; they supply no result. */
652 if (ctx->sequence != push_seq)
653 continue;
654 /* found it! */
	/*
	 * No break here: the scan continues so that any remaining entries
	 * with a sequence at or below push_seq are still checked for a
	 * pending commit.
	 */
655 commit_lsn = ctx->commit_lsn;
656 }
657 spin_unlock(&cil->xc_cil_lock);
658 return commit_lsn;
659}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ac97bddcadba..48d920891b94 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -377,6 +377,54 @@ typedef struct xlog_in_core {
377} xlog_in_core_t; 377} xlog_in_core_t;
378 378
379/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well as
381 * being passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
	/*
	 * One of these exists per checkpoint. commit_lsn doubles as a
	 * "commit record written" flag: code walking the committing list
	 * treats a zero commit_lsn as "still being pushed" and waits on the
	 * CIL's xc_commit_wait until it is assigned.
	 */
388 struct xfs_cil *cil; /* CIL this context is attached to */
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log; /* NOTE(review): presumably the log this CIL belongs to - confirm */
419 struct list_head xc_cil; /* log items, linked through li_cil */
420 spinlock_t xc_cil_lock; /* held while walking/updating xc_committing */
421 struct xfs_cil_ctx *xc_ctx; /* current checkpoint context */
422 struct rw_semaphore xc_ctx_lock; /* held for write while xc_ctx is switched or checked */
423 struct list_head xc_committing; /* contexts whose checkpoint is being committed */
424 sv_t xc_commit_wait; /* broadcast when a context gets its commit_lsn */
425};
426
427/*
380 * The reservation head lsn is not made up of a cycle number and block number. 428 * The reservation head lsn is not made up of a cycle number and block number.
381 * Instead, it uses a cycle number and byte number. Logs don't expect to 429 * Instead, it uses a cycle number and byte number. Logs don't expect to
382 * overflow 31 bits worth of byte offset, so using a byte number will mean 430 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -386,6 +434,7 @@ typedef struct log {
386 /* The following fields don't need locking */ 434 /* The following fields don't need locking */
387 struct xfs_mount *l_mp; /* mount point */ 435 struct xfs_mount *l_mp; /* mount point */
388 struct xfs_ail *l_ailp; /* AIL log is working with */ 436 struct xfs_ail *l_ailp; /* AIL log is working with */
437 struct xfs_cil *l_cilp; /* CIL log is working with */
389 struct xfs_buf *l_xbuf; /* extra buffer for log 438 struct xfs_buf *l_xbuf; /* extra buffer for log
390 * wrapping */ 439 * wrapping */
391 struct xfs_buftarg *l_targ; /* buftarg of log */ 440 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -436,14 +485,17 @@ typedef struct log {
436 485
437#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 486#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
438 487
439
440/* common routines */ 488/* common routines */
441extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 489extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
442extern int xlog_recover(xlog_t *log); 490extern int xlog_recover(xlog_t *log);
443extern int xlog_recover_finish(xlog_t *log); 491extern int xlog_recover_finish(xlog_t *log);
444extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 492extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
445 493
446extern kmem_zone_t *xfs_log_ticket_zone; 494extern kmem_zone_t *xfs_log_ticket_zone;
495struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
496 int count, char client, uint xflags,
497 int alloc_flags);
498
447 499
448static inline void 500static inline void
449xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) 501xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -453,6 +505,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
453 *off += bytes; 505 *off += bytes;
454} 506}
455 507
508void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
509int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
510 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
511 xlog_in_core_t **commit_iclog, uint flags);
512
513/*
514 * Committed Item List interfaces
515 */
516int xlog_cil_init(struct log *log);
517void xlog_cil_init_post_recovery(struct log *log);
518void xlog_cil_destroy(struct log *log);
519
520int xlog_cil_push(struct log *log, int push_now);
521xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
522
456/* 523/*
457 * Unmount record type is used as a pseudo transaction type for the ticket. 524 * Unmount record type is used as a pseudo transaction type for the ticket.
458 * It's value must be outside the range of XFS_TRANS_* values. 525 * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 269 must be synchronous except
270 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 40d9595a8de2..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -655,7 +655,7 @@ xfs_trans_apply_sb_deltas(
655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
656 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
657 */ 657 */
658STATIC void 658void
659xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
660 xfs_trans_t *tp) 660 xfs_trans_t *tp)
661{ 661{
@@ -883,7 +883,7 @@ xfs_trans_fill_vecs(
883 * they could be immediately flushed and we'd have to race with the flusher 883 * they could be immediately flushed and we'd have to race with the flusher
884 * trying to pull the item from the AIL as we add it. 884 * trying to pull the item from the AIL as we add it.
885 */ 885 */
886static void 886void
887xfs_trans_item_committed( 887xfs_trans_item_committed(
888 struct xfs_log_item *lip, 888 struct xfs_log_item *lip,
889 xfs_lsn_t commit_lsn, 889 xfs_lsn_t commit_lsn,
@@ -994,7 +994,7 @@ xfs_trans_uncommit(
994 xfs_trans_unreserve_and_mod_sb(tp); 994 xfs_trans_unreserve_and_mod_sb(tp);
995 xfs_trans_unreserve_and_mod_dquots(tp); 995 xfs_trans_unreserve_and_mod_dquots(tp);
996 996
997 xfs_trans_free_items(tp, flags); 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
998 xfs_trans_free(tp); 998 xfs_trans_free(tp);
999} 999}
1000 1000
@@ -1144,6 +1144,93 @@ xfs_trans_commit_iclog(
1144 return xfs_log_release_iclog(mp, commit_iclog); 1144 return xfs_log_release_iclog(mp, commit_iclog);
1145} 1145}
1146 1146
1147/*
1148 * Walk the log items and allocate log vector structures for
1149 * each item large enough to fit all the vectors they require.
1150 * Note that this format differs from the old log vector format in
1151 * that there is no transaction header in these log vectors.
1152 */
1153STATIC struct xfs_log_vec *
1154xfs_trans_alloc_log_vecs(
1155 xfs_trans_t *tp)
1156{
1157 xfs_log_item_desc_t *lidp;
1158 struct xfs_log_vec *lv = NULL;
1159 struct xfs_log_vec *ret_lv = NULL;
1160
1161 lidp = xfs_trans_first_item(tp);
1162
1163 /* Bail out if we didn't find a log item. */
1164 if (!lidp) {
1165 ASSERT(0);
1166 return NULL;
1167 }
1168
1169 while (lidp != NULL) {
1170 struct xfs_log_vec *new_lv;
1171
1172 /* Skip items which aren't dirty in this transaction. */
1173 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1174 lidp = xfs_trans_next_item(tp, lidp);
1175 continue;
1176 }
1177
1178 /* Skip items that do not have any vectors for writing */
1179 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1180 if (!lidp->lid_size) {
1181 lidp = xfs_trans_next_item(tp, lidp);
1182 continue;
1183 }
1184
1185 new_lv = kmem_zalloc(sizeof(*new_lv) +
1186 lidp->lid_size * sizeof(struct xfs_log_iovec),
1187 KM_SLEEP);
1188
1189 /* The allocated iovec region lies beyond the log vector. */
1190 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
1191 new_lv->lv_niovecs = lidp->lid_size;
1192 new_lv->lv_item = lidp->lid_item;
1193 if (!ret_lv)
1194 ret_lv = new_lv;
1195 else
1196 lv->lv_next = new_lv;
1197 lv = new_lv;
1198 lidp = xfs_trans_next_item(tp, lidp);
1199 }
1200
1201 return ret_lv;
1202}
1203
1204static int
1205xfs_trans_commit_cil(
1206 struct xfs_mount *mp,
1207 struct xfs_trans *tp,
1208 xfs_lsn_t *commit_lsn,
1209 int flags)
1210{
1211 struct xfs_log_vec *log_vector;
1212 int error;
1213
1214 /*
1215 * Get each log item to allocate a vector structure for
1216 * the log item to to pass to the log write code. The
1217 * CIL commit code will format the vector and save it away.
1218 */
1219 log_vector = xfs_trans_alloc_log_vecs(tp);
1220 if (!log_vector)
1221 return ENOMEM;
1222
1223 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1224 if (error)
1225 return error;
1226
1227 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1228
1229 /* xfs_trans_free_items() unlocks them first */
1230 xfs_trans_free_items(tp, *commit_lsn, 0);
1231 xfs_trans_free(tp);
1232 return 0;
1233}
1147 1234
1148/* 1235/*
1149 * xfs_trans_commit 1236 * xfs_trans_commit
@@ -1204,7 +1291,11 @@ _xfs_trans_commit(
1204 xfs_trans_apply_sb_deltas(tp); 1291 xfs_trans_apply_sb_deltas(tp);
1205 xfs_trans_apply_dquot_deltas(tp); 1292 xfs_trans_apply_dquot_deltas(tp);
1206 1293
1207 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); 1294 if (mp->m_flags & XFS_MOUNT_DELAYLOG)
1295 error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
1296 else
1297 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1298
1208 if (error == ENOMEM) { 1299 if (error == ENOMEM) {
1209 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1300 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1210 error = XFS_ERROR(EIO); 1301 error = XFS_ERROR(EIO);
@@ -1242,7 +1333,7 @@ out_unreserve:
1242 error = XFS_ERROR(EIO); 1333 error = XFS_ERROR(EIO);
1243 } 1334 }
1244 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1335 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1245 xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); 1336 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
1246 xfs_trans_free(tp); 1337 xfs_trans_free(tp);
1247 1338
1248 XFS_STATS_INC(xs_trans_empty); 1339 XFS_STATS_INC(xs_trans_empty);
@@ -1320,7 +1411,7 @@ xfs_trans_cancel(
1320 /* mark this thread as no longer being in a transaction */ 1411 /* mark this thread as no longer being in a transaction */
1321 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1412 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1322 1413
1323 xfs_trans_free_items(tp, flags); 1414 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1324 xfs_trans_free(tp); 1415 xfs_trans_free(tp);
1325} 1416}
1326 1417
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ff7e9e6eee84..b1ea20c66b3e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
106#define XFS_TRANS_GROWFSRT_FREE 39 106#define XFS_TRANS_GROWFSRT_FREE 39
107#define XFS_TRANS_SWAPEXT 40 107#define XFS_TRANS_SWAPEXT 40
108#define XFS_TRANS_SB_COUNT 41 108#define XFS_TRANS_SB_COUNT 41
109#define XFS_TRANS_TYPE_MAX 41 109#define XFS_TRANS_CHECKPOINT 42
110#define XFS_TRANS_TYPE_MAX 42
110/* new transaction types need to be reflected in xfs_logprint(8) */ 111/* new transaction types need to be reflected in xfs_logprint(8) */
111 112
112#define XFS_TRANS_TYPES \ 113#define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
148 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ 149 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
149 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ 150 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
150 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ 151 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
152 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
151 { XFS_TRANS_DUMMY1, "DUMMY1" }, \ 153 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
152 { XFS_TRANS_DUMMY2, "DUMMY2" }, \ 154 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
153 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } 155 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
@@ -829,6 +831,10 @@ typedef struct xfs_log_item {
829 /* buffer item iodone */ 831 /* buffer item iodone */
830 /* callback func */ 832 /* callback func */
831 struct xfs_item_ops *li_ops; /* function list */ 833 struct xfs_item_ops *li_ops; /* function list */
834
835 /* delayed logging */
836 struct list_head li_cil; /* CIL pointers */
837 struct xfs_log_vec *li_lv; /* active log vector */
832} xfs_log_item_t; 838} xfs_log_item_t;
833 839
834#define XFS_LI_IN_AIL 0x1 840#define XFS_LI_IN_AIL 0x1
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 2937a1e53318..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
299void 299void
300xfs_trans_free_items( 300xfs_trans_free_items(
301 xfs_trans_t *tp, 301 xfs_trans_t *tp,
302 xfs_lsn_t commit_lsn,
302 int flags) 303 int flags)
303{ 304{
304 xfs_log_item_chunk_t *licp; 305 xfs_log_item_chunk_t *licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
311 * Special case the embedded chunk so we don't free it below. 312 * Special case the embedded chunk so we don't free it below.
312 */ 313 */
313 if (!xfs_lic_are_all_free(licp)) { 314 if (!xfs_lic_are_all_free(licp)) {
314 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 315 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
315 xfs_lic_all_free(licp); 316 xfs_lic_all_free(licp);
316 licp->lic_unused = 0; 317 licp->lic_unused = 0;
317 } 318 }
@@ -322,7 +323,7 @@ xfs_trans_free_items(
322 */ 323 */
323 while (licp != NULL) { 324 while (licp != NULL) {
324 ASSERT(!xfs_lic_are_all_free(licp)); 325 ASSERT(!xfs_lic_are_all_free(licp));
325 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 326 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
326 next_licp = licp->lic_next; 327 next_licp = licp->lic_next;
327 kmem_free(licp); 328 kmem_free(licp);
328 licp = next_licp; 329 licp = next_licp;
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 901dc0f032da..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,9 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); 35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, 36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
37 struct xfs_log_item_desc *); 37 struct xfs_log_item_desc *);
38void xfs_trans_free_items(struct xfs_trans *, int); 38
39void xfs_trans_unlock_items(struct xfs_trans *, 39void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
40 xfs_lsn_t); 40void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
41 int flags);
42
43void xfs_trans_item_committed(struct xfs_log_item *lip,
44 xfs_lsn_t commit_lsn, int aborted);
45void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
41 46
42/* 47/*
43 * AIL traversal cursor. 48 * AIL traversal cursor.