aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/xfs/xfs_buf_item.c1
-rw-r--r--fs/xfs/xfs_dquot.c1
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_inode_item.c1
-rw-r--r--fs/xfs/xfs_log_cil.c258
-rw-r--r--fs/xfs/xfs_trans.h1
7 files changed, 202 insertions, 64 deletions
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 34257992934c..4561b1e2f2f9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -949,6 +949,7 @@ xfs_buf_item_free(
949 xfs_buf_log_item_t *bip) 949 xfs_buf_log_item_t *bip)
950{ 950{
951 xfs_buf_item_free_format(bip); 951 xfs_buf_item_free_format(bip);
952 kmem_free(bip->bli_item.li_lv_shadow);
952 kmem_zone_free(xfs_buf_item_zone, bip); 953 kmem_zone_free(xfs_buf_item_zone, bip);
953} 954}
954 955
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index e0646659ce16..ccb0811963b2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
74{ 74{
75 ASSERT(list_empty(&dqp->q_lru)); 75 ASSERT(list_empty(&dqp->q_lru));
76 76
77 kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
77 mutex_destroy(&dqp->q_qlock); 78 mutex_destroy(&dqp->q_qlock);
78 79
79 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); 80 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 814cff94e78f..2c7a1629e064 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
370 spin_lock(&ailp->xa_lock); 370 spin_lock(&ailp->xa_lock);
371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); 371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
372 372
373 kmem_free(qfs->qql_item.li_lv_shadow);
374 kmem_free(lip->li_lv_shadow);
373 kmem_free(qfs); 375 kmem_free(qfs);
374 kmem_free(qfe); 376 kmem_free(qfe);
375 return (xfs_lsn_t)-1; 377 return (xfs_lsn_t)-1;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 4aa0153214f9..ab779460ecbf 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,6 +40,7 @@ void
40xfs_efi_item_free( 40xfs_efi_item_free(
41 struct xfs_efi_log_item *efip) 41 struct xfs_efi_log_item *efip)
42{ 42{
43 kmem_free(efip->efi_item.li_lv_shadow);
43 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) 44 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
44 kmem_free(efip); 45 kmem_free(efip);
45 else 46 else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
300STATIC void 301STATIC void
301xfs_efd_item_free(struct xfs_efd_log_item *efdp) 302xfs_efd_item_free(struct xfs_efd_log_item *efdp)
302{ 303{
304 kmem_free(efdp->efd_item.li_lv_shadow);
303 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) 305 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
304 kmem_free(efdp); 306 kmem_free(efdp);
305 else 307 else
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a1b07612224c..892c2aced207 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,6 +651,7 @@ void
651xfs_inode_item_destroy( 651xfs_inode_item_destroy(
652 xfs_inode_t *ip) 652 xfs_inode_t *ip)
653{ 653{
654 kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
654 kmem_zone_free(xfs_ili_zone, ip->i_itemp); 655 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
655} 656}
656 657
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5e54e7955ea6..a4ab192e1792 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
78 log->l_cilp->xc_ctx->sequence = 1; 78 log->l_cilp->xc_ctx->sequence = 1;
79} 79}
80 80
81static inline int
82xlog_cil_iovec_space(
83 uint niovecs)
84{
85 return round_up((sizeof(struct xfs_log_vec) +
86 niovecs * sizeof(struct xfs_log_iovec)),
87 sizeof(uint64_t));
88}
89
90/*
91 * Allocate or pin log vector buffers for CIL insertion.
92 *
93 * The CIL currently uses disposable buffers for copying a snapshot of the
94 * modified items into the log during a push. The biggest problem with this is
95 * the requirement to allocate the disposable buffer during the commit if:
96 * a) does not exist; or
97 * b) it is too small
98 *
99 * If we do this allocation within xlog_cil_insert_format_items(), it is done
100 * under the xc_ctx_lock, which means that a CIL push cannot occur during
101 * the memory allocation. This means that we have a potential deadlock situation
102 * under low memory conditions when we have lots of dirty metadata pinned in
103 * the CIL and we need a CIL commit to occur to free memory.
104 *
105 * To avoid this, we need to move the memory allocation outside the
106 * xc_ctx_lock, but because the log vector buffers are disposable, that opens
107 * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
108 * vector buffers between the check and the formatting of the item into the
109 * log vector buffer within the xc_ctx_lock.
110 *
111 * Because the log vector buffer needs to be unchanged during the CIL push
112 * process, we cannot share the buffer between the transaction commit (which
113 * modifies the buffer) and the CIL push context that is writing the changes
114 * into the log. This means skipping preallocation of buffer space is
115 * unreliable, but we most definitely do not want to be allocating and freeing
116 * buffers unnecessarily during commits when overwrites can be done safely.
117 *
118 * The simplest solution to this problem is to allocate a shadow buffer when a
119 * log item is committed for the second time, and then to only use this buffer
120 * if necessary. The buffer can remain attached to the log item until such time
121 * it is needed, and this is the buffer that is reallocated to match the size of
122 * the incoming modification. Then during the formatting of the item we can swap
123 * the active buffer with the new one if we can't reuse the existing buffer. We
124 * don't free the old buffer as it may be reused on the next modification if
126 * its size is right, otherwise we'll free and reallocate it at that point.
126 *
127 * This function builds a vector for the changes in each log item in the
128 * transaction. It then works out the length of the buffer needed for each log
129 * item, allocates them and attaches the vector to the log item in preparation
130 * for the formatting step which occurs under the xc_ctx_lock.
131 *
132 * While this means the memory footprint goes up, it avoids the repeated
133 * alloc/free pattern that repeated modifications of an item would otherwise
134 * cause, and hence minimises the CPU overhead of such behaviour.
135 */
136static void
137xlog_cil_alloc_shadow_bufs(
138 struct xlog *log,
139 struct xfs_trans *tp)
140{
141 struct xfs_log_item_desc *lidp;
142
143 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
144 struct xfs_log_item *lip = lidp->lid_item;
145 struct xfs_log_vec *lv;
146 int niovecs = 0;
147 int nbytes = 0;
148 int buf_size;
149 bool ordered = false;
150
151 /* Skip items which aren't dirty in this transaction. */
152 if (!(lidp->lid_flags & XFS_LID_DIRTY))
153 continue;
154
155 /* get number of vecs and size of data to be stored */
156 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
157
158 /*
159 * Ordered items need to be tracked but we do not wish to write
160 * them. We need a logvec to track the object, but we do not
161 * need an iovec or buffer to be allocated for copying data.
162 */
163 if (niovecs == XFS_LOG_VEC_ORDERED) {
164 ordered = true;
165 niovecs = 0;
166 nbytes = 0;
167 }
168
169 /*
170 * We 64-bit align the length of each iovec so that the start
171 * of the next one is naturally aligned. We'll need to
172 * account for that slack space here. Then round nbytes up
173 * to 64-bit alignment so that the initial buffer alignment is
174 * easy to calculate and verify.
175 */
176 nbytes += niovecs * sizeof(uint64_t);
177 nbytes = round_up(nbytes, sizeof(uint64_t));
178
179 /*
180 * The data buffer needs to start 64-bit aligned, so round up
181 * that space to ensure we can align it appropriately and not
182 * overrun the buffer.
183 */
184 buf_size = nbytes + xlog_cil_iovec_space(niovecs);
185
186 /*
187 * if we have no shadow buffer, or it is too small, we need to
188 * reallocate it.
189 */
190 if (!lip->li_lv_shadow ||
191 buf_size > lip->li_lv_shadow->lv_size) {
192
193 /*
194 * We free and allocate here as a realloc would copy
195 * unnecessary data. We don't use kmem_zalloc() for the
196 * same reason - we don't need to zero the data area in
197 * the buffer, only the log vector header and the iovec
198 * storage.
199 */
200 kmem_free(lip->li_lv_shadow);
201
202 lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
203 memset(lv, 0, xlog_cil_iovec_space(niovecs));
204
205 lv->lv_item = lip;
206 lv->lv_size = buf_size;
207 if (ordered)
208 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
209 else
210 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
211 lip->li_lv_shadow = lv;
212 } else {
213 /* same or smaller, optimise common overwrite case */
214 lv = lip->li_lv_shadow;
215 if (ordered)
216 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
217 else
218 lv->lv_buf_len = 0;
219 lv->lv_bytes = 0;
220 lv->lv_next = NULL;
221 }
222
223 /* Ensure the lv is set up according to ->iop_size */
224 lv->lv_niovecs = niovecs;
225
226 /* The allocated data region lies beyond the iovec region */
227 lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
228 }
229
230}
231
81/* 232/*
82 * Prepare the log item for insertion into the CIL. Calculate the difference in 233 * Prepare the log item for insertion into the CIL. Calculate the difference in
83 * log space and vectors it will consume, and if it is a new item pin it as 234 * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
100 /* 251 /*
101 * If there is no old LV, this is the first time we've seen the item in 252 * If there is no old LV, this is the first time we've seen the item in
102 * this CIL context and so we need to pin it. If we are replacing the 253 * this CIL context and so we need to pin it. If we are replacing the
103 * old_lv, then remove the space it accounts for and free it. 254 * old_lv, then remove the space it accounts for and make it the shadow
255 * buffer for later freeing. In both cases we are now switching to the
256 * shadow buffer, so update the pointer to it appropriately.
104 */ 257 */
105 if (!old_lv) 258 if (!old_lv) {
106 lv->lv_item->li_ops->iop_pin(lv->lv_item); 259 lv->lv_item->li_ops->iop_pin(lv->lv_item);
107 else if (old_lv != lv) { 260 lv->lv_item->li_lv_shadow = NULL;
261 } else if (old_lv != lv) {
108 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); 262 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
109 263
110 *diff_len -= old_lv->lv_bytes; 264 *diff_len -= old_lv->lv_bytes;
111 *diff_iovecs -= old_lv->lv_niovecs; 265 *diff_iovecs -= old_lv->lv_niovecs;
112 kmem_free(old_lv); 266 lv->lv_item->li_lv_shadow = old_lv;
113 } 267 }
114 268
115 /* attach new log vector to log item */ 269 /* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
133 * write it out asynchronously without needing to relock the object that was 287 * write it out asynchronously without needing to relock the object that was
134 * modified at the time it gets written into the iclog. 288 * modified at the time it gets written into the iclog.
135 * 289 *
136 * This function builds a vector for the changes in each log item in the 290 * This function takes the prepared log vectors attached to each log item, and
137 * transaction. It then works out the length of the buffer needed for each log 291 * formats the changes into the log vector buffer. The buffer it uses is
138 * item, allocates them and formats the vector for the item into the buffer. 292 * dependent on the current state of the vector in the CIL - the shadow lv is
139 * The buffer is then attached to the log item are then inserted into the 293 * guaranteed to be large enough for the current modification, but we will only
140 * Committed Item List for tracking until the next checkpoint is written out. 294 * use that if we can't reuse the existing lv. If we can't reuse the existing
295 * lv, then simply swap it out for the shadow lv. We don't free it - that is
296 * done lazily either by the next modification or the freeing of the log item.
141 * 297 *
142 * We don't set up region headers during this process; we simply copy the 298 * We don't set up region headers during this process; we simply copy the
143 * regions into the flat buffer. We can do this because we still have to do a 299 * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
170 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 326 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
171 struct xfs_log_item *lip = lidp->lid_item; 327 struct xfs_log_item *lip = lidp->lid_item;
172 struct xfs_log_vec *lv; 328 struct xfs_log_vec *lv;
173 struct xfs_log_vec *old_lv; 329 struct xfs_log_vec *old_lv = NULL;
174 int niovecs = 0; 330 struct xfs_log_vec *shadow;
175 int nbytes = 0;
176 int buf_size;
177 bool ordered = false; 331 bool ordered = false;
178 332
179 /* Skip items which aren't dirty in this transaction. */ 333 /* Skip items which aren't dirty in this transaction. */
180 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 334 if (!(lidp->lid_flags & XFS_LID_DIRTY))
181 continue; 335 continue;
182 336
183 /* get number of vecs and size of data to be stored */
184 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
185
186 /* Skip items that do not have any vectors for writing */
187 if (!niovecs)
188 continue;
189
190 /* 337 /*
191 * Ordered items need to be tracked but we do not wish to write 338 * The formatting size information is already attached to
192 * them. We need a logvec to track the object, but we do not 339 * the shadow lv on the log item.
193 * need an iovec or buffer to be allocated for copying data.
194 */ 340 */
195 if (niovecs == XFS_LOG_VEC_ORDERED) { 341 shadow = lip->li_lv_shadow;
342 if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
196 ordered = true; 343 ordered = true;
197 niovecs = 0;
198 nbytes = 0;
199 }
200 344
201 /* 345 /* Skip items that do not have any vectors for writing */
202 * We 64-bit align the length of each iovec so that the start 346 if (!shadow->lv_niovecs && !ordered)
203 * of the next one is naturally aligned. We'll need to 347 continue;
204 * account for that slack space here. Then round nbytes up
205 * to 64-bit alignment so that the initial buffer alignment is
206 * easy to calculate and verify.
207 */
208 nbytes += niovecs * sizeof(uint64_t);
209 nbytes = round_up(nbytes, sizeof(uint64_t));
210
211 /* grab the old item if it exists for reservation accounting */
212 old_lv = lip->li_lv;
213
214 /*
215 * The data buffer needs to start 64-bit aligned, so round up
216 * that space to ensure we can align it appropriately and not
217 * overrun the buffer.
218 */
219 buf_size = nbytes +
220 round_up((sizeof(struct xfs_log_vec) +
221 niovecs * sizeof(struct xfs_log_iovec)),
222 sizeof(uint64_t));
223 348
224 /* compare to existing item size */ 349 /* compare to existing item size */
225 if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { 350 old_lv = lip->li_lv;
351 if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
226 /* same or smaller, optimise common overwrite case */ 352 /* same or smaller, optimise common overwrite case */
227 lv = lip->li_lv; 353 lv = lip->li_lv;
228 lv->lv_next = NULL; 354 lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
236 */ 362 */
237 *diff_iovecs -= lv->lv_niovecs; 363 *diff_iovecs -= lv->lv_niovecs;
238 *diff_len -= lv->lv_bytes; 364 *diff_len -= lv->lv_bytes;
365
366 /* Ensure the lv is set up according to ->iop_size */
367 lv->lv_niovecs = shadow->lv_niovecs;
368
369 /* reset the lv buffer information for new formatting */
370 lv->lv_buf_len = 0;
371 lv->lv_bytes = 0;
372 lv->lv_buf = (char *)lv +
373 xlog_cil_iovec_space(lv->lv_niovecs);
239 } else { 374 } else {
240 /* allocate new data chunk */ 375 /* switch to shadow buffer! */
241 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); 376 lv = shadow;
242 lv->lv_item = lip; 377 lv->lv_item = lip;
243 lv->lv_size = buf_size;
244 if (ordered) { 378 if (ordered) {
245 /* track as an ordered logvec */ 379 /* track as an ordered logvec */
246 ASSERT(lip->li_lv == NULL); 380 ASSERT(lip->li_lv == NULL);
247 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
248 goto insert; 381 goto insert;
249 } 382 }
250 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
251 } 383 }
252 384
253 /* Ensure the lv is set up according to ->iop_size */
254 lv->lv_niovecs = niovecs;
255
256 /* The allocated data region lies beyond the iovec region */
257 lv->lv_buf_len = 0;
258 lv->lv_bytes = 0;
259 lv->lv_buf = (char *)lv + buf_size - nbytes;
260 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); 385 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
261
262 lip->li_ops->iop_format(lip, lv); 386 lip->li_ops->iop_format(lip, lv);
263insert: 387insert:
264 ASSERT(lv->lv_buf_len <= nbytes);
265 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); 388 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
266 } 389 }
267} 390}
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
783 struct xlog *log = mp->m_log; 906 struct xlog *log = mp->m_log;
784 struct xfs_cil *cil = log->l_cilp; 907 struct xfs_cil *cil = log->l_cilp;
785 908
909 /*
910 * Do all necessary memory allocation before we lock the CIL.
911 * This ensures the allocation does not deadlock with a CIL
912 * push in memory reclaim (e.g. from kswapd).
913 */
914 xlog_cil_alloc_shadow_bufs(log, tp);
915
786 /* lock out background commit */ 916 /* lock out background commit */
787 down_read(&cil->xc_ctx_lock); 917 down_read(&cil->xc_ctx_lock);
788 918
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9a462e892e4f..9b2b9fa89331 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
52 /* delayed logging */ 52 /* delayed logging */
53 struct list_head li_cil; /* CIL pointers */ 53 struct list_head li_cil; /* CIL pointers */
54 struct xfs_log_vec *li_lv; /* active log vector */ 54 struct xfs_log_vec *li_lv; /* active log vector */
55 struct xfs_log_vec *li_lv_shadow; /* standby vector */
55 xfs_lsn_t li_seq; /* CIL commit seq */ 56 xfs_lsn_t li_seq; /* CIL commit seq */
56} xfs_log_item_t; 57} xfs_log_item_t;
57 58