aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Christoph Hellwig <hch@lst.de> 2017-01-25 10:49:35 -0500
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2017-04-08 03:30:30 -0400
commit: 08a2a26816825b2724fa6e2616df716b31e4a582 (patch)
tree: 64d5730429db67791b16a190a5659cf71989688a
parent: 9be1c33d4a995d6369b94c7bb6ae0e8d18e7d658 (diff)
xfs: use per-AG reservations for the finobt
commit 76d771b4cbe33c581bd6ca2710c120be51172440 upstream. Currently we try to rely on the global reserved block pool for block allocations for the free inode btree, but I have customer reports (fairly complex workload, need to find an easier reproducer) where that is not enough as the AG where we free an inode that requires a new finobt block is entirely full. This causes us to cancel a dirty transaction and thus triggers a file system shutdown. I think the right way to guard against this is to treat the finobt the same way as the refcount btree and have per-AG reservations for the possible worst case size of it, and the patch below implements that. Note that this could increase mount times with large finobt trees. In an ideal world we would have added a field for the number of finobt blocks to the AGI, similar to what we did for the refcount blocks. We should add it next time we rev the AGI or AGF format by adding new fields. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- fs/xfs/libxfs/xfs_ag_resv.c 47
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c 90
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.h 3
-rw-r--r-- fs/xfs/xfs_inode.c 23
-rw-r--r-- fs/xfs/xfs_mount.h 1
5 files changed, 144 insertions, 20 deletions
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 94234bff40dc..33db69be4832 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -39,6 +39,7 @@
39#include "xfs_rmap_btree.h" 39#include "xfs_rmap_btree.h"
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41#include "xfs_refcount_btree.h" 41#include "xfs_refcount_btree.h"
42#include "xfs_ialloc_btree.h"
42 43
43/* 44/*
44 * Per-AG Block Reservations 45 * Per-AG Block Reservations
@@ -210,6 +211,9 @@ __xfs_ag_resv_init(
210 if (error) { 211 if (error) {
211 trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, 212 trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
212 error, _RET_IP_); 213 error, _RET_IP_);
214 xfs_warn(mp,
215"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
216 pag->pag_agno);
213 return error; 217 return error;
214 } 218 }
215 219
@@ -228,6 +232,8 @@ int
228xfs_ag_resv_init( 232xfs_ag_resv_init(
229 struct xfs_perag *pag) 233 struct xfs_perag *pag)
230{ 234{
235 struct xfs_mount *mp = pag->pag_mount;
236 xfs_agnumber_t agno = pag->pag_agno;
231 xfs_extlen_t ask; 237 xfs_extlen_t ask;
232 xfs_extlen_t used; 238 xfs_extlen_t used;
233 int error = 0; 239 int error = 0;
@@ -236,23 +242,45 @@ xfs_ag_resv_init(
236 if (pag->pag_meta_resv.ar_asked == 0) { 242 if (pag->pag_meta_resv.ar_asked == 0) {
237 ask = used = 0; 243 ask = used = 0;
238 244
239 error = xfs_refcountbt_calc_reserves(pag->pag_mount, 245 error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
240 pag->pag_agno, &ask, &used);
241 if (error) 246 if (error)
242 goto out; 247 goto out;
243 248
244 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, 249 error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
245 ask, used);
246 if (error) 250 if (error)
247 goto out; 251 goto out;
252
253 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
254 ask, used);
255 if (error) {
256 /*
257 * Because we didn't have per-AG reservations when the
258 * finobt feature was added we might not be able to
259 * reserve all needed blocks. Warn and fall back to the
260 * old and potentially buggy code in that case, but
261 * ensure we do have the reservation for the refcountbt.
262 */
263 ask = used = 0;
264
265 mp->m_inotbt_nores = true;
266
267 error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
268 &used);
269 if (error)
270 goto out;
271
272 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
273 ask, used);
274 if (error)
275 goto out;
276 }
248 } 277 }
249 278
250 /* Create the AGFL metadata reservation */ 279 /* Create the AGFL metadata reservation */
251 if (pag->pag_agfl_resv.ar_asked == 0) { 280 if (pag->pag_agfl_resv.ar_asked == 0) {
252 ask = used = 0; 281 ask = used = 0;
253 282
254 error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, 283 error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
255 &ask, &used);
256 if (error) 284 if (error)
257 goto out; 285 goto out;
258 286
@@ -261,9 +289,16 @@ xfs_ag_resv_init(
261 goto out; 289 goto out;
262 } 290 }
263 291
292#ifdef DEBUG
293 /* need to read in the AGF for the ASSERT below to work */
294 error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
295 if (error)
296 return error;
297
264 ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + 298 ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
265 xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= 299 xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
266 pag->pagf_freeblks + pag->pagf_flcount); 300 pag->pagf_freeblks + pag->pagf_flcount);
301#endif
267out: 302out:
268 return error; 303 return error;
269} 304}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 6c6b95947e71..b9c351ff0422 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -82,11 +82,12 @@ xfs_finobt_set_root(
82} 82}
83 83
84STATIC int 84STATIC int
85xfs_inobt_alloc_block( 85__xfs_inobt_alloc_block(
86 struct xfs_btree_cur *cur, 86 struct xfs_btree_cur *cur,
87 union xfs_btree_ptr *start, 87 union xfs_btree_ptr *start,
88 union xfs_btree_ptr *new, 88 union xfs_btree_ptr *new,
89 int *stat) 89 int *stat,
90 enum xfs_ag_resv_type resv)
90{ 91{
91 xfs_alloc_arg_t args; /* block allocation args */ 92 xfs_alloc_arg_t args; /* block allocation args */
92 int error; /* error return value */ 93 int error; /* error return value */
@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
103 args.maxlen = 1; 104 args.maxlen = 1;
104 args.prod = 1; 105 args.prod = 1;
105 args.type = XFS_ALLOCTYPE_NEAR_BNO; 106 args.type = XFS_ALLOCTYPE_NEAR_BNO;
107 args.resv = resv;
106 108
107 error = xfs_alloc_vextent(&args); 109 error = xfs_alloc_vextent(&args);
108 if (error) { 110 if (error) {
@@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
123} 125}
124 126
125STATIC int 127STATIC int
128xfs_inobt_alloc_block(
129 struct xfs_btree_cur *cur,
130 union xfs_btree_ptr *start,
131 union xfs_btree_ptr *new,
132 int *stat)
133{
134 return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
135}
136
137STATIC int
138xfs_finobt_alloc_block(
139 struct xfs_btree_cur *cur,
140 union xfs_btree_ptr *start,
141 union xfs_btree_ptr *new,
142 int *stat)
143{
144 return __xfs_inobt_alloc_block(cur, start, new, stat,
145 XFS_AG_RESV_METADATA);
146}
147
148STATIC int
126xfs_inobt_free_block( 149xfs_inobt_free_block(
127 struct xfs_btree_cur *cur, 150 struct xfs_btree_cur *cur,
128 struct xfs_buf *bp) 151 struct xfs_buf *bp)
@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
328 351
329 .dup_cursor = xfs_inobt_dup_cursor, 352 .dup_cursor = xfs_inobt_dup_cursor,
330 .set_root = xfs_finobt_set_root, 353 .set_root = xfs_finobt_set_root,
331 .alloc_block = xfs_inobt_alloc_block, 354 .alloc_block = xfs_finobt_alloc_block,
332 .free_block = xfs_inobt_free_block, 355 .free_block = xfs_inobt_free_block,
333 .get_minrecs = xfs_inobt_get_minrecs, 356 .get_minrecs = xfs_inobt_get_minrecs,
334 .get_maxrecs = xfs_inobt_get_maxrecs, 357 .get_maxrecs = xfs_inobt_get_maxrecs,
@@ -478,3 +501,64 @@ xfs_inobt_rec_check_count(
478 return 0; 501 return 0;
479} 502}
480#endif /* DEBUG */ 503#endif /* DEBUG */
504
505static xfs_extlen_t
506xfs_inobt_max_size(
507 struct xfs_mount *mp)
508{
509 /* Bail out if we're uninitialized, which can happen in mkfs. */
510 if (mp->m_inobt_mxr[0] == 0)
511 return 0;
512
513 return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
514 (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
515 XFS_INODES_PER_CHUNK);
516}
517
518static int
519xfs_inobt_count_blocks(
520 struct xfs_mount *mp,
521 xfs_agnumber_t agno,
522 xfs_btnum_t btnum,
523 xfs_extlen_t *tree_blocks)
524{
525 struct xfs_buf *agbp;
526 struct xfs_btree_cur *cur;
527 int error;
528
529 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
530 if (error)
531 return error;
532
533 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
534 error = xfs_btree_count_blocks(cur, tree_blocks);
535 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
536 xfs_buf_relse(agbp);
537
538 return error;
539}
540
541/*
542 * Figure out how many blocks to reserve and how many are used by this btree.
543 */
544int
545xfs_finobt_calc_reserves(
546 struct xfs_mount *mp,
547 xfs_agnumber_t agno,
548 xfs_extlen_t *ask,
549 xfs_extlen_t *used)
550{
551 xfs_extlen_t tree_len = 0;
552 int error;
553
554 if (!xfs_sb_version_hasfinobt(&mp->m_sb))
555 return 0;
556
557 error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
558 if (error)
559 return error;
560
561 *ask += xfs_inobt_max_size(mp);
562 *used += tree_len;
563 return 0;
564}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index bd88453217ce..aa81e2e63f3f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
72#define xfs_inobt_rec_check_count(mp, rec) 0 72#define xfs_inobt_rec_check_count(mp, rec) 0
73#endif /* DEBUG */ 73#endif /* DEBUG */
74 74
75int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
76 xfs_extlen_t *ask, xfs_extlen_t *used);
77
75#endif /* __XFS_IALLOC_BTREE_H__ */ 78#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 512ff13ed66a..a1c7e138dbca 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1801,22 +1801,23 @@ xfs_inactive_ifree(
1801 int error; 1801 int error;
1802 1802
1803 /* 1803 /*
1804 * The ifree transaction might need to allocate blocks for record 1804 * We try to use a per-AG reservation for any block needed by the finobt
1805 * insertion to the finobt. We don't want to fail here at ENOSPC, so 1805 * tree, but as the finobt feature predates the per-AG reservation
1806 * allow ifree to dip into the reserved block pool if necessary. 1806 * support a degraded file system might not have enough space for the
1807 * 1807 * reservation at mount time. In that case try to dip into the reserved
1808 * Freeing large sets of inodes generally means freeing inode chunks, 1808 * pool and pray.
1809 * directory and file data blocks, so this should be relatively safe.
1810 * Only under severe circumstances should it be possible to free enough
1811 * inodes to exhaust the reserve block pool via finobt expansion while
1812 * at the same time not creating free space in the filesystem.
1813 * 1809 *
1814 * Send a warning if the reservation does happen to fail, as the inode 1810 * Send a warning if the reservation does happen to fail, as the inode
1815 * now remains allocated and sits on the unlinked list until the fs is 1811 * now remains allocated and sits on the unlinked list until the fs is
1816 * repaired. 1812 * repaired.
1817 */ 1813 */
1818 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 1814 if (unlikely(mp->m_inotbt_nores)) {
1819 XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); 1815 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1816 XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1817 &tp);
1818 } else {
1819 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1820 }
1820 if (error) { 1821 if (error) {
1821 if (error == -ENOSPC) { 1822 if (error == -ENOSPC) {
1822 xfs_warn_ratelimited(mp, 1823 xfs_warn_ratelimited(mp,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 819b80b15bfb..1bf878b0492c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -140,6 +140,7 @@ typedef struct xfs_mount {
140 int m_fixedfsid[2]; /* unchanged for life of FS */ 140 int m_fixedfsid[2]; /* unchanged for life of FS */
141 uint m_dmevmask; /* DMI events for this FS */ 141 uint m_dmevmask; /* DMI events for this FS */
142 __uint64_t m_flags; /* global mount flags */ 142 __uint64_t m_flags; /* global mount flags */
143 bool m_inotbt_nores; /* no per-AG finobt resv. */
143 int m_ialloc_inos; /* inodes in inode allocation */ 144 int m_ialloc_inos; /* inodes in inode allocation */
144 int m_ialloc_blks; /* blocks in inode allocation */ 145 int m_ialloc_blks; /* blocks in inode allocation */
145 int m_ialloc_min_blks;/* min blocks in sparse inode 146 int m_ialloc_min_blks;/* min blocks in sparse inode