aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-08-28 02:10:35 -0400
committerBen Myers <bpm@sgi.com>2013-08-30 14:59:30 -0400
commit239567033c38933c4d6f402f9f8a2126df73e4c6 (patch)
tree26881a5e045d7d509c86bd7b63e30dafc6db912f /fs/xfs
parentb121099d84b0311a26ca04d33961febb33580fe4 (diff)
xfs: inode log reservations are too small
We've been seeing occasional problems with log space leaks and transaction underruns such as this for some time: XFS (dm-0): xlog_write: reservation summary: trans type = FSYNC_TS (36) unit res = 2740 bytes current res = -4 bytes total reg = 0 bytes (o/flow = 0 bytes) ophdrs = 0 (ophdr space = 0 bytes) ophdr + reg = 0 bytes num regions = 0 Turns out that xfstests generic/311 is reliably reproducing this problem with the test it runs at sequence 16 of its execution. It is a 100% reliable reproducer with the mkfs configuration of "-b size=1024 -m crc=1" on a 10GB scratch device. The problem? Inode forks in btree format are logged in memory format, not disk format (i.e. bmbt format, not bmdr format). That means there is a btree block header being logged, when such a structure is never written to the inode fork in bmdr format. The bmdr header in the inode is only 4 bytes, while the bmbt header is 24 bytes for v4 filesystems and 72 bytes for v5 filesystems. We currently reserve the inode size plus the rounded up overhead of logging a buffer, which is 128 bytes. That means the reservation for a 512 byte inode is 640 bytes. What we can actually log is: inode core, data and attr fork = 512 bytes inode log format + log op header = 56 + 12 = 68 bytes data fork bmbt hdr = 24/72 bytes attr fork bmbt hdr = 24/72 bytes So, for v2 inodes we can log at least 628 bytes, but if we split that inode over the end of the log across log buffers, we also need another log op header, which takes us to 640 bytes. If there's another reservation taken out of this that I haven't taken into account (perhaps multiple iclog splits?) or I haven't correctly calculated the bmbt format space used (entirely possible), then we will overrun it. For v3 inodes the maximum is actually 724 bytes, and even a single maximally sized btree format fork can blow it (652 bytes). 
And that's exactly what is happening with the FSYNC_TS transaction in the above output - it's consumed 644 bytes of space after the CIL context took the space reserved for it (2100 bytes). This problem has always been present in the XFS code - the btree format inode forks have always been logged in this manner. Hence there has always been the possibility of an overrun with such a transaction. The CRC code has just exposed it frequently enough to be able to debug and understand the root cause.... So, let's fix all the inode log space reservations. [ I'm so glad we spent the effort to clean up the transaction reservation code. This is an easy fix now. ] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_trans_resv.c72
1 files changed, 53 insertions, 19 deletions
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 24110f36f729..a65a3cc40610 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -73,6 +73,39 @@ xfs_calc_buf_res(
73} 73}
74 74
75/* 75/*
76 * Logging inodes is really tricksy. They are logged in memory format,
77 * which means that what we write into the log doesn't directly translate into
78 * the amount of space they use on disk.
79 *
80 * Case in point - btree format forks in memory format use more space than the
81 * on-disk format. In memory, the buffer contains a normal btree block header so
82 * the btree code can treat it as though it is just another generic buffer.
83 * However, when we write it to the inode fork, we don't write all of this
84 * header as it isn't needed. e.g. the root is only ever in the inode, so
85 * there's no need for sibling pointers which would waste 16 bytes of space.
86 *
87 * Hence when we have an inode with a maximally sized btree format fork, then
88 * the amount of information we actually log is greater than the size of the inode
89 * on disk. Hence we need an inode reservation function that calculates all this
90 * correctly. So, we log:
91 *
92 * - log op headers for object
93 * - inode log format object
94 * - the entire inode contents (core + 2 forks)
95 * - two bmap btree block headers
96 */
97STATIC uint
98xfs_calc_inode_res(
99 struct xfs_mount *mp,
100 uint ninodes)
101{
102 return ninodes * (sizeof(struct xlog_op_header) +
103 sizeof(struct xfs_inode_log_format) +
104 mp->m_sb.sb_inodesize +
105 2 * XFS_BMBT_BLOCK_LEN(mp));
106}
107
108/*
76 * Various log reservation values. 109 * Various log reservation values.
77 * 110 *
78 * These are based on the size of the file system block because that is what 111 * These are based on the size of the file system block because that is what
@@ -111,7 +144,7 @@ xfs_calc_write_reservation(
111 struct xfs_mount *mp) 144 struct xfs_mount *mp)
112{ 145{
113 return XFS_DQUOT_LOGRES(mp) + 146 return XFS_DQUOT_LOGRES(mp) +
114 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 147 MAX((xfs_calc_inode_res(mp, 1) +
115 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 148 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
116 XFS_FSB_TO_B(mp, 1)) + 149 XFS_FSB_TO_B(mp, 1)) +
117 xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + 150 xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
@@ -140,7 +173,7 @@ xfs_calc_itruncate_reservation(
140 struct xfs_mount *mp) 173 struct xfs_mount *mp)
141{ 174{
142 return XFS_DQUOT_LOGRES(mp) + 175 return XFS_DQUOT_LOGRES(mp) +
143 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 176 MAX((xfs_calc_inode_res(mp, 1) +
144 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, 177 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
145 XFS_FSB_TO_B(mp, 1))), 178 XFS_FSB_TO_B(mp, 1))),
146 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + 179 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
@@ -170,7 +203,7 @@ xfs_calc_rename_reservation(
170 struct xfs_mount *mp) 203 struct xfs_mount *mp)
171{ 204{
172 return XFS_DQUOT_LOGRES(mp) + 205 return XFS_DQUOT_LOGRES(mp) +
173 MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) + 206 MAX((xfs_calc_inode_res(mp, 4) +
174 xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), 207 xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
175 XFS_FSB_TO_B(mp, 1))), 208 XFS_FSB_TO_B(mp, 1))),
176 (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + 209 (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
@@ -195,7 +228,7 @@ xfs_calc_link_reservation(
195 struct xfs_mount *mp) 228 struct xfs_mount *mp)
196{ 229{
197 return XFS_DQUOT_LOGRES(mp) + 230 return XFS_DQUOT_LOGRES(mp) +
198 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 231 MAX((xfs_calc_inode_res(mp, 2) +
199 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 232 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
200 XFS_FSB_TO_B(mp, 1))), 233 XFS_FSB_TO_B(mp, 1))),
201 (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + 234 (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
@@ -220,7 +253,7 @@ xfs_calc_remove_reservation(
220 struct xfs_mount *mp) 253 struct xfs_mount *mp)
221{ 254{
222 return XFS_DQUOT_LOGRES(mp) + 255 return XFS_DQUOT_LOGRES(mp) +
223 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 256 MAX((xfs_calc_inode_res(mp, 2) +
224 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 257 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
225 XFS_FSB_TO_B(mp, 1))), 258 XFS_FSB_TO_B(mp, 1))),
226 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + 259 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
@@ -247,7 +280,7 @@ STATIC uint
247xfs_calc_create_resv_modify( 280xfs_calc_create_resv_modify(
248 struct xfs_mount *mp) 281 struct xfs_mount *mp)
249{ 282{
250 return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 283 return xfs_calc_inode_res(mp, 2) +
251 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + 284 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
252 (uint)XFS_FSB_TO_B(mp, 1) + 285 (uint)XFS_FSB_TO_B(mp, 1) +
253 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); 286 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
@@ -357,7 +390,7 @@ xfs_calc_ifree_reservation(
357 struct xfs_mount *mp) 390 struct xfs_mount *mp)
358{ 391{
359 return XFS_DQUOT_LOGRES(mp) + 392 return XFS_DQUOT_LOGRES(mp) +
360 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 393 xfs_calc_inode_res(mp, 1) +
361 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 394 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
362 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + 395 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
363 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), 396 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
@@ -378,9 +411,8 @@ xfs_calc_ichange_reservation(
378 struct xfs_mount *mp) 411 struct xfs_mount *mp)
379{ 412{
380 return XFS_DQUOT_LOGRES(mp) + 413 return XFS_DQUOT_LOGRES(mp) +
381 mp->m_sb.sb_inodesize + 414 xfs_calc_inode_res(mp, 1) +
382 mp->m_sb.sb_sectsize + 415 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
383 512;
384 416
385} 417}
386 418
@@ -416,7 +448,7 @@ xfs_calc_growrtalloc_reservation(
416 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 448 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
417 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 449 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
418 XFS_FSB_TO_B(mp, 1)) + 450 XFS_FSB_TO_B(mp, 1)) +
419 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 451 xfs_calc_inode_res(mp, 1) +
420 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 452 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
421 XFS_FSB_TO_B(mp, 1)); 453 XFS_FSB_TO_B(mp, 1));
422} 454}
@@ -448,7 +480,7 @@ xfs_calc_growrtfree_reservation(
448 struct xfs_mount *mp) 480 struct xfs_mount *mp)
449{ 481{
450 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + 482 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
451 xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 483 xfs_calc_inode_res(mp, 2) +
452 xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + 484 xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
453 xfs_calc_buf_res(1, mp->m_rsumsize); 485 xfs_calc_buf_res(1, mp->m_rsumsize);
454} 486}
@@ -461,7 +493,7 @@ STATIC uint
461xfs_calc_swrite_reservation( 493xfs_calc_swrite_reservation(
462 struct xfs_mount *mp) 494 struct xfs_mount *mp)
463{ 495{
464 return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); 496 return xfs_calc_inode_res(mp, 1);
465} 497}
466 498
467/* 499/*
@@ -469,9 +501,10 @@ xfs_calc_swrite_reservation(
469 * inode 501 * inode
470 */ 502 */
471STATIC uint 503STATIC uint
472xfs_calc_writeid_reservation(xfs_mount_t *mp) 504xfs_calc_writeid_reservation(
505 struct xfs_mount *mp)
473{ 506{
474 return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); 507 return xfs_calc_inode_res(mp, 1);
475} 508}
476 509
477/* 510/*
@@ -487,7 +520,7 @@ xfs_calc_addafork_reservation(
487 struct xfs_mount *mp) 520 struct xfs_mount *mp)
488{ 521{
489 return XFS_DQUOT_LOGRES(mp) + 522 return XFS_DQUOT_LOGRES(mp) +
490 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 523 xfs_calc_inode_res(mp, 1) +
491 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 524 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
492 xfs_calc_buf_res(1, mp->m_dirblksize) + 525 xfs_calc_buf_res(1, mp->m_dirblksize) +
493 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, 526 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
@@ -511,7 +544,7 @@ STATIC uint
511xfs_calc_attrinval_reservation( 544xfs_calc_attrinval_reservation(
512 struct xfs_mount *mp) 545 struct xfs_mount *mp)
513{ 546{
514 return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 547 return MAX((xfs_calc_inode_res(mp, 1) +
515 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), 548 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
516 XFS_FSB_TO_B(mp, 1))), 549 XFS_FSB_TO_B(mp, 1))),
517 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + 550 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
@@ -535,7 +568,7 @@ xfs_calc_attrsetm_reservation(
535 struct xfs_mount *mp) 568 struct xfs_mount *mp)
536{ 569{
537 return XFS_DQUOT_LOGRES(mp) + 570 return XFS_DQUOT_LOGRES(mp) +
538 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 571 xfs_calc_inode_res(mp, 1) +
539 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + 572 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
540 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); 573 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
541} 574}
@@ -575,7 +608,7 @@ xfs_calc_attrrm_reservation(
575 struct xfs_mount *mp) 608 struct xfs_mount *mp)
576{ 609{
577 return XFS_DQUOT_LOGRES(mp) + 610 return XFS_DQUOT_LOGRES(mp) +
578 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + 611 MAX((xfs_calc_inode_res(mp, 1) +
579 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, 612 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
580 XFS_FSB_TO_B(mp, 1)) + 613 XFS_FSB_TO_B(mp, 1)) +
581 (uint)XFS_FSB_TO_B(mp, 614 (uint)XFS_FSB_TO_B(mp,
@@ -627,6 +660,7 @@ STATIC uint
627xfs_calc_qm_dqalloc_reservation( 660xfs_calc_qm_dqalloc_reservation(
628 struct xfs_mount *mp) 661 struct xfs_mount *mp)
629{ 662{
663 ASSERT(M_RES(mp)->tr_write.tr_logres);
630 return M_RES(mp)->tr_write.tr_logres + 664 return M_RES(mp)->tr_write.tr_logres +
631 xfs_calc_buf_res(1, 665 xfs_calc_buf_res(1,
632 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); 666 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);