aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2017-01-25 10:49:35 -0500
committerDarrick J. Wong <darrick.wong@oracle.com>2017-01-25 10:49:35 -0500
commit76d771b4cbe33c581bd6ca2710c120be51172440 (patch)
tree1fa79ad971f2dc9502d80a9bdeacbf88a4d2c732
parent4dfa2b84118fd6c95202ae87e62adf5000ccd4d0 (diff)
xfs: use per-AG reservations for the finobt
Currently we try to rely on the global reserved block pool for block allocations for the free inode btree, but I have customer reports (fairly complex workload, need to find an easier reproducer) where that is not enough as the AG where we free an inode that requires a new finobt block is entirely full. This causes us to cancel a dirty transaction and thus a file system shutdown. I think the right way to guard against this is to treat the finot the same way as the refcount btree and have a per-AG reservations for the possible worst case size of it, and the patch below implements that. Note that this could increase mount times with large finobt trees. In an ideal world we would have added a field for the number of finobt fields to the AGI, similar to what we did for the refcount blocks. We should do add it next time we rev the AGI or AGF format by adding new fields. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c47
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c90
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h3
-rw-r--r--fs/xfs/xfs_inode.c23
-rw-r--r--fs/xfs/xfs_mount.h1
5 files changed, 144 insertions, 20 deletions
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 94234bff40dc..33db69be4832 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -39,6 +39,7 @@
39#include "xfs_rmap_btree.h" 39#include "xfs_rmap_btree.h"
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41#include "xfs_refcount_btree.h" 41#include "xfs_refcount_btree.h"
42#include "xfs_ialloc_btree.h"
42 43
43/* 44/*
44 * Per-AG Block Reservations 45 * Per-AG Block Reservations
@@ -210,6 +211,9 @@ __xfs_ag_resv_init(
210 if (error) { 211 if (error) {
211 trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, 212 trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
212 error, _RET_IP_); 213 error, _RET_IP_);
214 xfs_warn(mp,
215"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
216 pag->pag_agno);
213 return error; 217 return error;
214 } 218 }
215 219
@@ -228,6 +232,8 @@ int
228xfs_ag_resv_init( 232xfs_ag_resv_init(
229 struct xfs_perag *pag) 233 struct xfs_perag *pag)
230{ 234{
235 struct xfs_mount *mp = pag->pag_mount;
236 xfs_agnumber_t agno = pag->pag_agno;
231 xfs_extlen_t ask; 237 xfs_extlen_t ask;
232 xfs_extlen_t used; 238 xfs_extlen_t used;
233 int error = 0; 239 int error = 0;
@@ -236,23 +242,45 @@ xfs_ag_resv_init(
236 if (pag->pag_meta_resv.ar_asked == 0) { 242 if (pag->pag_meta_resv.ar_asked == 0) {
237 ask = used = 0; 243 ask = used = 0;
238 244
239 error = xfs_refcountbt_calc_reserves(pag->pag_mount, 245 error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
240 pag->pag_agno, &ask, &used);
241 if (error) 246 if (error)
242 goto out; 247 goto out;
243 248
244 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, 249 error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
245 ask, used);
246 if (error) 250 if (error)
247 goto out; 251 goto out;
252
253 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
254 ask, used);
255 if (error) {
256 /*
257 * Because we didn't have per-AG reservations when the
258 * finobt feature was added we might not be able to
259 * reserve all needed blocks. Warn and fall back to the
260 * old and potentially buggy code in that case, but
261 * ensure we do have the reservation for the refcountbt.
262 */
263 ask = used = 0;
264
265 mp->m_inotbt_nores = true;
266
267 error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
268 &used);
269 if (error)
270 goto out;
271
272 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
273 ask, used);
274 if (error)
275 goto out;
276 }
248 } 277 }
249 278
250 /* Create the AGFL metadata reservation */ 279 /* Create the AGFL metadata reservation */
251 if (pag->pag_agfl_resv.ar_asked == 0) { 280 if (pag->pag_agfl_resv.ar_asked == 0) {
252 ask = used = 0; 281 ask = used = 0;
253 282
254 error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, 283 error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
255 &ask, &used);
256 if (error) 284 if (error)
257 goto out; 285 goto out;
258 286
@@ -261,9 +289,16 @@ xfs_ag_resv_init(
261 goto out; 289 goto out;
262 } 290 }
263 291
292#ifdef DEBUG
293 /* need to read in the AGF for the ASSERT below to work */
294 error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
295 if (error)
296 return error;
297
264 ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + 298 ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
265 xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= 299 xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
266 pag->pagf_freeblks + pag->pagf_flcount); 300 pag->pagf_freeblks + pag->pagf_flcount);
301#endif
267out: 302out:
268 return error; 303 return error;
269} 304}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 0fd086d03d41..7c471881c9a6 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -82,11 +82,12 @@ xfs_finobt_set_root(
82} 82}
83 83
84STATIC int 84STATIC int
85xfs_inobt_alloc_block( 85__xfs_inobt_alloc_block(
86 struct xfs_btree_cur *cur, 86 struct xfs_btree_cur *cur,
87 union xfs_btree_ptr *start, 87 union xfs_btree_ptr *start,
88 union xfs_btree_ptr *new, 88 union xfs_btree_ptr *new,
89 int *stat) 89 int *stat,
90 enum xfs_ag_resv_type resv)
90{ 91{
91 xfs_alloc_arg_t args; /* block allocation args */ 92 xfs_alloc_arg_t args; /* block allocation args */
92 int error; /* error return value */ 93 int error; /* error return value */
@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
103 args.maxlen = 1; 104 args.maxlen = 1;
104 args.prod = 1; 105 args.prod = 1;
105 args.type = XFS_ALLOCTYPE_NEAR_BNO; 106 args.type = XFS_ALLOCTYPE_NEAR_BNO;
107 args.resv = resv;
106 108
107 error = xfs_alloc_vextent(&args); 109 error = xfs_alloc_vextent(&args);
108 if (error) { 110 if (error) {
@@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
123} 125}
124 126
125STATIC int 127STATIC int
128xfs_inobt_alloc_block(
129 struct xfs_btree_cur *cur,
130 union xfs_btree_ptr *start,
131 union xfs_btree_ptr *new,
132 int *stat)
133{
134 return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
135}
136
137STATIC int
138xfs_finobt_alloc_block(
139 struct xfs_btree_cur *cur,
140 union xfs_btree_ptr *start,
141 union xfs_btree_ptr *new,
142 int *stat)
143{
144 return __xfs_inobt_alloc_block(cur, start, new, stat,
145 XFS_AG_RESV_METADATA);
146}
147
148STATIC int
126xfs_inobt_free_block( 149xfs_inobt_free_block(
127 struct xfs_btree_cur *cur, 150 struct xfs_btree_cur *cur,
128 struct xfs_buf *bp) 151 struct xfs_buf *bp)
@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
328 351
329 .dup_cursor = xfs_inobt_dup_cursor, 352 .dup_cursor = xfs_inobt_dup_cursor,
330 .set_root = xfs_finobt_set_root, 353 .set_root = xfs_finobt_set_root,
331 .alloc_block = xfs_inobt_alloc_block, 354 .alloc_block = xfs_finobt_alloc_block,
332 .free_block = xfs_inobt_free_block, 355 .free_block = xfs_inobt_free_block,
333 .get_minrecs = xfs_inobt_get_minrecs, 356 .get_minrecs = xfs_inobt_get_minrecs,
334 .get_maxrecs = xfs_inobt_get_maxrecs, 357 .get_maxrecs = xfs_inobt_get_maxrecs,
@@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
480 return 0; 503 return 0;
481} 504}
482#endif /* DEBUG */ 505#endif /* DEBUG */
506
507static xfs_extlen_t
508xfs_inobt_max_size(
509 struct xfs_mount *mp)
510{
511 /* Bail out if we're uninitialized, which can happen in mkfs. */
512 if (mp->m_inobt_mxr[0] == 0)
513 return 0;
514
515 return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
516 (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
517 XFS_INODES_PER_CHUNK);
518}
519
520static int
521xfs_inobt_count_blocks(
522 struct xfs_mount *mp,
523 xfs_agnumber_t agno,
524 xfs_btnum_t btnum,
525 xfs_extlen_t *tree_blocks)
526{
527 struct xfs_buf *agbp;
528 struct xfs_btree_cur *cur;
529 int error;
530
531 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
532 if (error)
533 return error;
534
535 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
536 error = xfs_btree_count_blocks(cur, tree_blocks);
537 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
538 xfs_buf_relse(agbp);
539
540 return error;
541}
542
543/*
544 * Figure out how many blocks to reserve and how many are used by this btree.
545 */
546int
547xfs_finobt_calc_reserves(
548 struct xfs_mount *mp,
549 xfs_agnumber_t agno,
550 xfs_extlen_t *ask,
551 xfs_extlen_t *used)
552{
553 xfs_extlen_t tree_len = 0;
554 int error;
555
556 if (!xfs_sb_version_hasfinobt(&mp->m_sb))
557 return 0;
558
559 error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
560 if (error)
561 return error;
562
563 *ask += xfs_inobt_max_size(mp);
564 *used += tree_len;
565 return 0;
566}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index bd88453217ce..aa81e2e63f3f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
72#define xfs_inobt_rec_check_count(mp, rec) 0 72#define xfs_inobt_rec_check_count(mp, rec) 0
73#endif /* DEBUG */ 73#endif /* DEBUG */
74 74
75int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
76 xfs_extlen_t *ask, xfs_extlen_t *used);
77
75#endif /* __XFS_IALLOC_BTREE_H__ */ 78#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b9557795eb74..de32f0fe47c8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1792,22 +1792,23 @@ xfs_inactive_ifree(
1792 int error; 1792 int error;
1793 1793
1794 /* 1794 /*
1795 * The ifree transaction might need to allocate blocks for record 1795 * We try to use a per-AG reservation for any block needed by the finobt
1796 * insertion to the finobt. We don't want to fail here at ENOSPC, so 1796 * tree, but as the finobt feature predates the per-AG reservation
1797 * allow ifree to dip into the reserved block pool if necessary. 1797 * support a degraded file system might not have enough space for the
1798 * 1798 * reservation at mount time. In that case try to dip into the reserved
1799 * Freeing large sets of inodes generally means freeing inode chunks, 1799 * pool and pray.
1800 * directory and file data blocks, so this should be relatively safe.
1801 * Only under severe circumstances should it be possible to free enough
1802 * inodes to exhaust the reserve block pool via finobt expansion while
1803 * at the same time not creating free space in the filesystem.
1804 * 1800 *
1805 * Send a warning if the reservation does happen to fail, as the inode 1801 * Send a warning if the reservation does happen to fail, as the inode
1806 * now remains allocated and sits on the unlinked list until the fs is 1802 * now remains allocated and sits on the unlinked list until the fs is
1807 * repaired. 1803 * repaired.
1808 */ 1804 */
1809 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 1805 if (unlikely(mp->m_inotbt_nores)) {
1810 XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); 1806 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1807 XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1808 &tp);
1809 } else {
1810 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1811 }
1811 if (error) { 1812 if (error) {
1812 if (error == -ENOSPC) { 1813 if (error == -ENOSPC) {
1813 xfs_warn_ratelimited(mp, 1814 xfs_warn_ratelimited(mp,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 84f785218907..7f351f706b7a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -140,6 +140,7 @@ typedef struct xfs_mount {
140 int m_fixedfsid[2]; /* unchanged for life of FS */ 140 int m_fixedfsid[2]; /* unchanged for life of FS */
141 uint m_dmevmask; /* DMI events for this FS */ 141 uint m_dmevmask; /* DMI events for this FS */
142 __uint64_t m_flags; /* global mount flags */ 142 __uint64_t m_flags; /* global mount flags */
143 bool m_inotbt_nores; /* no per-AG finobt resv. */
143 int m_ialloc_inos; /* inodes in inode allocation */ 144 int m_ialloc_inos; /* inodes in inode allocation */
144 int m_ialloc_blks; /* blocks in inode allocation */ 145 int m_ialloc_blks; /* blocks in inode allocation */
145 int m_ialloc_min_blks;/* min blocks in sparse inode 146 int m_ialloc_min_blks;/* min blocks in sparse inode