aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2015-05-31 20:51:38 -0400
committerDave Chinner <david@fromorbit.com>2015-05-31 20:51:38 -0400
commitb9a350a1183efd7b63e59b6eaa39abfea908d0be (patch)
tree34144351b97b4dc749b6f6b003821af6c6a30824
parente01c025fbdd5584bc2c8f6b88cb014f5f9bd790f (diff)
parent22ce1e1472fda6ce740cee966bb8e25a3cc662bd (diff)
Merge branch 'xfs-sparse-inode' into for-next
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c42
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_format.h48
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c541
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h12
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c93
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_sb.c30
-rw-r--r--fs/xfs/xfs_fsops.c4
-rw-r--r--fs/xfs/xfs_inode.c28
-rw-r--r--fs/xfs/xfs_itable.c13
-rw-r--r--fs/xfs/xfs_log_recover.c26
-rw-r--r--fs/xfs/xfs_mount.c16
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_trace.h47
16 files changed, 829 insertions, 86 deletions
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 516162be1398..bc78ac08e72e 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
149{ 149{
150 xfs_agblock_t bno; 150 xfs_agblock_t bno;
151 xfs_extlen_t len; 151 xfs_extlen_t len;
152 xfs_extlen_t diff;
152 153
153 /* Trim busy sections out of found extent */ 154 /* Trim busy sections out of found extent */
154 xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); 155 xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
155 156
157 /*
158 * If we have a largish extent that happens to start before min_agbno,
159 * see if we can shift it into range...
160 */
161 if (bno < args->min_agbno && bno + len > args->min_agbno) {
162 diff = args->min_agbno - bno;
163 if (len > diff) {
164 bno += diff;
165 len -= diff;
166 }
167 }
168
156 if (args->alignment > 1 && len >= args->minlen) { 169 if (args->alignment > 1 && len >= args->minlen) {
157 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 170 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
158 xfs_extlen_t diff = aligned_bno - bno; 171
172 diff = aligned_bno - bno;
159 173
160 *resbno = aligned_bno; 174 *resbno = aligned_bno;
161 *reslen = diff >= len ? 0 : len - diff; 175 *reslen = diff >= len ? 0 : len - diff;
@@ -795,9 +809,13 @@ xfs_alloc_find_best_extent(
795 * The good extent is closer than this one. 809 * The good extent is closer than this one.
796 */ 810 */
797 if (!dir) { 811 if (!dir) {
812 if (*sbnoa > args->max_agbno)
813 goto out_use_good;
798 if (*sbnoa >= args->agbno + gdiff) 814 if (*sbnoa >= args->agbno + gdiff)
799 goto out_use_good; 815 goto out_use_good;
800 } else { 816 } else {
817 if (*sbnoa < args->min_agbno)
818 goto out_use_good;
801 if (*sbnoa <= args->agbno - gdiff) 819 if (*sbnoa <= args->agbno - gdiff)
802 goto out_use_good; 820 goto out_use_good;
803 } 821 }
@@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near(
884 dofirst = prandom_u32() & 1; 902 dofirst = prandom_u32() & 1;
885#endif 903#endif
886 904
905 /* handle uninitialized agbno range so caller doesn't have to */
906 if (!args->min_agbno && !args->max_agbno)
907 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
908 ASSERT(args->min_agbno <= args->max_agbno);
909
910 /* clamp agbno to the range if it's outside */
911 if (args->agbno < args->min_agbno)
912 args->agbno = args->min_agbno;
913 if (args->agbno > args->max_agbno)
914 args->agbno = args->max_agbno;
915
887restart: 916restart:
888 bno_cur_lt = NULL; 917 bno_cur_lt = NULL;
889 bno_cur_gt = NULL; 918 bno_cur_gt = NULL;
@@ -976,6 +1005,8 @@ restart:
976 &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
977 if (ltlena < args->minlen) 1006 if (ltlena < args->minlen)
978 continue; 1007 continue;
1008 if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
1009 continue;
979 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1010 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
980 xfs_alloc_fix_len(args); 1011 xfs_alloc_fix_len(args);
981 ASSERT(args->len >= args->minlen); 1012 ASSERT(args->len >= args->minlen);
@@ -1096,11 +1127,11 @@ restart:
1096 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); 1127 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1097 xfs_alloc_compute_aligned(args, ltbno, ltlen, 1128 xfs_alloc_compute_aligned(args, ltbno, ltlen,
1098 &ltbnoa, &ltlena); 1129 &ltbnoa, &ltlena);
1099 if (ltlena >= args->minlen) 1130 if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
1100 break; 1131 break;
1101 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1132 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
1102 goto error0; 1133 goto error0;
1103 if (!i) { 1134 if (!i || ltbnoa < args->min_agbno) {
1104 xfs_btree_del_cursor(bno_cur_lt, 1135 xfs_btree_del_cursor(bno_cur_lt,
1105 XFS_BTREE_NOERROR); 1136 XFS_BTREE_NOERROR);
1106 bno_cur_lt = NULL; 1137 bno_cur_lt = NULL;
@@ -1112,11 +1143,11 @@ restart:
1112 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); 1143 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1113 xfs_alloc_compute_aligned(args, gtbno, gtlen, 1144 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1114 &gtbnoa, &gtlena); 1145 &gtbnoa, &gtlena);
1115 if (gtlena >= args->minlen) 1146 if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
1116 break; 1147 break;
1117 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1148 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
1118 goto error0; 1149 goto error0;
1119 if (!i) { 1150 if (!i || gtbnoa > args->max_agbno) {
1120 xfs_btree_del_cursor(bno_cur_gt, 1151 xfs_btree_del_cursor(bno_cur_gt,
1121 XFS_BTREE_NOERROR); 1152 XFS_BTREE_NOERROR);
1122 bno_cur_gt = NULL; 1153 bno_cur_gt = NULL;
@@ -1216,6 +1247,7 @@ restart:
1216 ASSERT(ltnew >= ltbno); 1247 ASSERT(ltnew >= ltbno);
1217 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1248 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1218 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1249 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1250 ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
1219 args->agbno = ltnew; 1251 args->agbno = ltnew;
1220 1252
1221 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1253 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d1b4b6a5c894..29f27b272b7f 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
112 xfs_extlen_t total; /* total blocks needed in xaction */ 112 xfs_extlen_t total; /* total blocks needed in xaction */
113 xfs_extlen_t alignment; /* align answer to multiple of this */ 113 xfs_extlen_t alignment; /* align answer to multiple of this */
114 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ 114 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
115 xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
116 xfs_agblock_t max_agbno; /* ... */
115 xfs_extlen_t len; /* output: actual size of extent */ 117 xfs_extlen_t len; /* output: actual size of extent */
116 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ 118 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
117 xfs_alloctype_t otype; /* original allocation type */ 119 xfs_alloctype_t otype; /* original allocation type */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index ff22a4d9ad0c..815f61b02bc1 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
170 __uint32_t sb_features_log_incompat; 170 __uint32_t sb_features_log_incompat;
171 171
172 __uint32_t sb_crc; /* superblock crc */ 172 __uint32_t sb_crc; /* superblock crc */
173 __uint32_t sb_pad; 173 xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
174 174
175 xfs_ino_t sb_pquotino; /* project quota inode */ 175 xfs_ino_t sb_pquotino; /* project quota inode */
176 xfs_lsn_t sb_lsn; /* last write sequence */ 176 xfs_lsn_t sb_lsn; /* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
256 __be32 sb_features_log_incompat; 256 __be32 sb_features_log_incompat;
257 257
258 __le32 sb_crc; /* superblock crc */ 258 __le32 sb_crc; /* superblock crc */
259 __be32 sb_pad; 259 __be32 sb_spino_align; /* sparse inode chunk alignment */
260 260
261 __be64 sb_pquotino; /* project quota inode */ 261 __be64 sb_pquotino; /* project quota inode */
262 __be64 sb_lsn; /* last write sequence */ 262 __be64 sb_lsn; /* last write sequence */
@@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
457} 457}
458 458
459#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ 459#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
460#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
460#define XFS_SB_FEAT_INCOMPAT_ALL \ 461#define XFS_SB_FEAT_INCOMPAT_ALL \
461 (XFS_SB_FEAT_INCOMPAT_FTYPE) 462 (XFS_SB_FEAT_INCOMPAT_FTYPE| \
463 XFS_SB_FEAT_INCOMPAT_SPINODES)
462 464
463#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL 465#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
464static inline bool 466static inline bool
@@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
506 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); 508 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
507} 509}
508 510
511static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
512{
513 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
514 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
515}
516
509/* 517/*
510 * end of superblock version macros 518 * end of superblock version macros
511 */ 519 */
@@ -1216,26 +1224,54 @@ typedef __uint64_t xfs_inofree_t;
1216#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 1224#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
1217#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) 1225#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
1218 1226
1227#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
1228#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t))
1229#define XFS_INODES_PER_HOLEMASK_BIT \
1230 (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
1231
1219static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 1232static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
1220{ 1233{
1221 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; 1234 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
1222} 1235}
1223 1236
1224/* 1237/*
1225 * Data record structure 1238 * The on-disk inode record structure has two formats. The original "full"
1239 * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
1240 * and replaces the 3 high-order freecount bytes with the holemask and inode
1241 * count.
1242 *
1243 * The holemask of the sparse record format allows an inode chunk to have holes
1244 * that refer to blocks not owned by the inode record. This facilitates inode
1245 * allocation in the event of severe free space fragmentation.
1226 */ 1246 */
1227typedef struct xfs_inobt_rec { 1247typedef struct xfs_inobt_rec {
1228 __be32 ir_startino; /* starting inode number */ 1248 __be32 ir_startino; /* starting inode number */
1229 __be32 ir_freecount; /* count of free inodes (set bits) */ 1249 union {
1250 struct {
1251 __be32 ir_freecount; /* count of free inodes */
1252 } f;
1253 struct {
1254 __be16 ir_holemask;/* hole mask for sparse chunks */
1255 __u8 ir_count; /* total inode count */
1256 __u8 ir_freecount; /* count of free inodes */
1257 } sp;
1258 } ir_u;
1230 __be64 ir_free; /* free inode mask */ 1259 __be64 ir_free; /* free inode mask */
1231} xfs_inobt_rec_t; 1260} xfs_inobt_rec_t;
1232 1261
1233typedef struct xfs_inobt_rec_incore { 1262typedef struct xfs_inobt_rec_incore {
1234 xfs_agino_t ir_startino; /* starting inode number */ 1263 xfs_agino_t ir_startino; /* starting inode number */
1235 __int32_t ir_freecount; /* count of free inodes (set bits) */ 1264 __uint16_t ir_holemask; /* hole mask for sparse chunks */
1265 __uint8_t ir_count; /* total inode count */
1266 __uint8_t ir_freecount; /* count of free inodes (set bits) */
1236 xfs_inofree_t ir_free; /* free inode mask */ 1267 xfs_inofree_t ir_free; /* free inode mask */
1237} xfs_inobt_rec_incore_t; 1268} xfs_inobt_rec_incore_t;
1238 1269
1270static inline bool xfs_inobt_issparse(uint16_t holemask)
1271{
1272 /* non-zero holemask represents a sparse rec. */
1273 return holemask;
1274}
1239 1275
1240/* 1276/*
1241 * Key structure 1277 * Key structure
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..89689c6a43e2 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
239#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ 239#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
240#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ 240#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
241#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ 241#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
242#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
242 243
243/* 244/*
244 * Minimum and maximum sizes needed for growth checks. 245 * Minimum and maximum sizes needed for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 1c9e75521250..a18bc75cc216 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
65 int *stat) /* success/failure */ 65 int *stat) /* success/failure */
66{ 66{
67 cur->bc_rec.i.ir_startino = ino; 67 cur->bc_rec.i.ir_startino = ino;
68 cur->bc_rec.i.ir_holemask = 0;
69 cur->bc_rec.i.ir_count = 0;
68 cur->bc_rec.i.ir_freecount = 0; 70 cur->bc_rec.i.ir_freecount = 0;
69 cur->bc_rec.i.ir_free = 0; 71 cur->bc_rec.i.ir_free = 0;
70 return xfs_btree_lookup(cur, dir, stat); 72 return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
82 union xfs_btree_rec rec; 84 union xfs_btree_rec rec;
83 85
84 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); 86 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
85 rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); 87 if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
88 rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
89 rec.inobt.ir_u.sp.ir_count = irec->ir_count;
90 rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
91 } else {
92 /* ir_holemask/ir_count not supported on-disk */
93 rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
94 }
86 rec.inobt.ir_free = cpu_to_be64(irec->ir_free); 95 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
87 return xfs_btree_update(cur, &rec); 96 return xfs_btree_update(cur, &rec);
88} 97}
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
100 int error; 109 int error;
101 110
102 error = xfs_btree_get_rec(cur, &rec, stat); 111 error = xfs_btree_get_rec(cur, &rec, stat);
103 if (!error && *stat == 1) { 112 if (error || *stat == 0)
104 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); 113 return error;
105 irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); 114
106 irec->ir_free = be64_to_cpu(rec->inobt.ir_free); 115 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
116 if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
117 irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
118 irec->ir_count = rec->inobt.ir_u.sp.ir_count;
119 irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
120 } else {
121 /*
122 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
123 * values for full inode chunks.
124 */
125 irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
126 irec->ir_count = XFS_INODES_PER_CHUNK;
127 irec->ir_freecount =
128 be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
107 } 129 }
108 return error; 130 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
131
132 return 0;
109} 133}
110 134
111/* 135/*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
114STATIC int 138STATIC int
115xfs_inobt_insert_rec( 139xfs_inobt_insert_rec(
116 struct xfs_btree_cur *cur, 140 struct xfs_btree_cur *cur,
141 __uint16_t holemask,
142 __uint8_t count,
117 __int32_t freecount, 143 __int32_t freecount,
118 xfs_inofree_t free, 144 xfs_inofree_t free,
119 int *stat) 145 int *stat)
120{ 146{
147 cur->bc_rec.i.ir_holemask = holemask;
148 cur->bc_rec.i.ir_count = count;
121 cur->bc_rec.i.ir_freecount = freecount; 149 cur->bc_rec.i.ir_freecount = freecount;
122 cur->bc_rec.i.ir_free = free; 150 cur->bc_rec.i.ir_free = free;
123 return xfs_btree_insert(cur, stat); 151 return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
154 } 182 }
155 ASSERT(i == 0); 183 ASSERT(i == 0);
156 184
157 error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, 185 error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
186 XFS_INODES_PER_CHUNK,
187 XFS_INODES_PER_CHUNK,
158 XFS_INOBT_ALL_FREE, &i); 188 XFS_INOBT_ALL_FREE, &i);
159 if (error) { 189 if (error) {
160 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 190 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
220 struct xfs_mount *mp, 250 struct xfs_mount *mp,
221 struct xfs_trans *tp, 251 struct xfs_trans *tp,
222 struct list_head *buffer_list, 252 struct list_head *buffer_list,
253 int icount,
223 xfs_agnumber_t agno, 254 xfs_agnumber_t agno,
224 xfs_agblock_t agbno, 255 xfs_agblock_t agbno,
225 xfs_agblock_t length, 256 xfs_agblock_t length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
275 * they track in the AIL as if they were physically logged. 306 * they track in the AIL as if they were physically logged.
276 */ 307 */
277 if (tp) 308 if (tp)
278 xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, 309 xfs_icreate_log(tp, agno, agbno, icount,
279 mp->m_sb.sb_inodesize, length, gen); 310 mp->m_sb.sb_inodesize, length, gen);
280 } else 311 } else
281 version = 2; 312 version = 2;
@@ -347,6 +378,214 @@ xfs_ialloc_inode_init(
347} 378}
348 379
349/* 380/*
381 * Align startino and allocmask for a recently allocated sparse chunk such that
382 * they are fit for insertion (or merge) into the on-disk inode btrees.
383 *
384 * Background:
385 *
386 * When enabled, sparse inode support increases the inode alignment from cluster
387 * size to inode chunk size. This means that the minimum range between two
388 * non-adjacent inode records in the inobt is large enough for a full inode
389 * record. This allows for cluster sized, cluster aligned block allocation
390 * without need to worry about whether the resulting inode record overlaps with
391 * another record in the tree. Without this basic rule, we would have to deal
392 * with the consequences of overlap by potentially undoing recent allocations in
393 * the inode allocation codepath.
394 *
395 * Because of this alignment rule (which is enforced on mount), there are two
396 * inobt possibilities for newly allocated sparse chunks. One is that the
397 * aligned inode record for the chunk covers a range of inodes not already
398 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
399 * other is that a record already exists at the aligned startino that considers
400 * the newly allocated range as sparse. In the latter case, record content is
401 * merged in hope that sparse inode chunks fill to full chunks over time.
402 */
403STATIC void
404xfs_align_sparse_ino(
405 struct xfs_mount *mp,
406 xfs_agino_t *startino,
407 uint16_t *allocmask)
408{
409 xfs_agblock_t agbno;
410 xfs_agblock_t mod;
411 int offset;
412
413 agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
414 mod = agbno % mp->m_sb.sb_inoalignmt;
415 if (!mod)
416 return;
417
418 /* calculate the inode offset and align startino */
419 offset = mod << mp->m_sb.sb_inopblog;
420 *startino -= offset;
421
422 /*
423 * Since startino has been aligned down, left shift allocmask such that
424 * it continues to represent the same physical inodes relative to the
425 * new startino.
426 */
427 *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
428}
429
430/*
431 * Determine whether the source inode record can merge into the target. Both
432 * records must be sparse, the inode ranges must match and there must be no
433 * allocation overlap between the records.
434 */
435STATIC bool
436__xfs_inobt_can_merge(
437 struct xfs_inobt_rec_incore *trec, /* tgt record */
438 struct xfs_inobt_rec_incore *srec) /* src record */
439{
440 uint64_t talloc;
441 uint64_t salloc;
442
443 /* records must cover the same inode range */
444 if (trec->ir_startino != srec->ir_startino)
445 return false;
446
447 /* both records must be sparse */
448 if (!xfs_inobt_issparse(trec->ir_holemask) ||
449 !xfs_inobt_issparse(srec->ir_holemask))
450 return false;
451
452 /* both records must track some inodes */
453 if (!trec->ir_count || !srec->ir_count)
454 return false;
455
456 /* can't exceed capacity of a full record */
457 if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
458 return false;
459
460 /* verify there is no allocation overlap */
461 talloc = xfs_inobt_irec_to_allocmask(trec);
462 salloc = xfs_inobt_irec_to_allocmask(srec);
463 if (talloc & salloc)
464 return false;
465
466 return true;
467}
468
469/*
470 * Merge the source inode record into the target. The caller must call
471 * __xfs_inobt_can_merge() to ensure the merge is valid.
472 */
473STATIC void
474__xfs_inobt_rec_merge(
475 struct xfs_inobt_rec_incore *trec, /* target */
476 struct xfs_inobt_rec_incore *srec) /* src */
477{
478 ASSERT(trec->ir_startino == srec->ir_startino);
479
480 /* combine the counts */
481 trec->ir_count += srec->ir_count;
482 trec->ir_freecount += srec->ir_freecount;
483
484 /*
485 * Merge the holemask and free mask. For both fields, 0 bits refer to
486 * allocated inodes. We combine the allocated ranges with bitwise AND.
487 */
488 trec->ir_holemask &= srec->ir_holemask;
489 trec->ir_free &= srec->ir_free;
490}
491
492/*
493 * Insert a new sparse inode chunk into the associated inode btree. The inode
494 * record for the sparse chunk is pre-aligned to a startino that should match
495 * any pre-existing sparse inode record in the tree. This allows sparse chunks
496 * to fill over time.
497 *
498 * This function supports two modes of handling preexisting records depending on
499 * the merge flag. If merge is true, the provided record is merged with the
500 * existing record and updated in place. The merged record is returned in nrec.
501 * If merge is false, an existing record is replaced with the provided record.
502 * If no preexisting record exists, the provided record is always inserted.
503 *
504 * It is considered corruption if a merge is requested and not possible. Given
505 * the sparse inode alignment constraints, this should never happen.
506 */
507STATIC int
508xfs_inobt_insert_sprec(
509 struct xfs_mount *mp,
510 struct xfs_trans *tp,
511 struct xfs_buf *agbp,
512 int btnum,
513 struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
514 bool merge) /* merge or replace */
515{
516 struct xfs_btree_cur *cur;
517 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
518 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
519 int error;
520 int i;
521 struct xfs_inobt_rec_incore rec;
522
523 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
524
525 /* the new record is pre-aligned so we know where to look */
526 error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
527 if (error)
528 goto error;
529 /* if nothing there, insert a new record and return */
530 if (i == 0) {
531 error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
532 nrec->ir_count, nrec->ir_freecount,
533 nrec->ir_free, &i);
534 if (error)
535 goto error;
536 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
537
538 goto out;
539 }
540
541 /*
542 * A record exists at this startino. Merge or replace the record
543 * depending on what we've been asked to do.
544 */
545 if (merge) {
546 error = xfs_inobt_get_rec(cur, &rec, &i);
547 if (error)
548 goto error;
549 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
550 XFS_WANT_CORRUPTED_GOTO(mp,
551 rec.ir_startino == nrec->ir_startino,
552 error);
553
554 /*
555 * This should never fail. If we have coexisting records that
556 * cannot merge, something is seriously wrong.
557 */
558 XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
559 error);
560
561 trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
562 rec.ir_holemask, nrec->ir_startino,
563 nrec->ir_holemask);
564
565 /* merge to nrec to output the updated record */
566 __xfs_inobt_rec_merge(nrec, &rec);
567
568 trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
569 nrec->ir_holemask);
570
571 error = xfs_inobt_rec_check_count(mp, nrec);
572 if (error)
573 goto error;
574 }
575
576 error = xfs_inobt_update(cur, nrec);
577 if (error)
578 goto error;
579
580out:
581 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
582 return 0;
583error:
584 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
585 return error;
586}
587
588/*
350 * Allocate new inodes in the allocation group specified by agbp. 589 * Allocate new inodes in the allocation group specified by agbp.
351 * Return 0 for success, else error code. 590 * Return 0 for success, else error code.
352 */ 591 */
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
364 xfs_agino_t newlen; /* new number of inodes */ 603 xfs_agino_t newlen; /* new number of inodes */
365 int isaligned = 0; /* inode allocation at stripe unit */ 604 int isaligned = 0; /* inode allocation at stripe unit */
366 /* boundary */ 605 /* boundary */
606 uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
607 struct xfs_inobt_rec_incore rec;
367 struct xfs_perag *pag; 608 struct xfs_perag *pag;
368 609
610 int do_sparse = 0;
611
612#ifdef DEBUG
613 /* randomly do sparse inode allocations */
614 if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb))
615 do_sparse = prandom_u32() & 1;
616#endif
617
369 memset(&args, 0, sizeof(args)); 618 memset(&args, 0, sizeof(args));
370 args.tp = tp; 619 args.tp = tp;
371 args.mp = tp->t_mountp; 620 args.mp = tp->t_mountp;
621 args.fsbno = NULLFSBLOCK;
372 622
373 /* 623 /*
374 * Locking will ensure that we don't have two callers in here 624 * Locking will ensure that we don't have two callers in here
@@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc(
390 agno = be32_to_cpu(agi->agi_seqno); 640 agno = be32_to_cpu(agi->agi_seqno);
391 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 641 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
392 args.mp->m_ialloc_blks; 642 args.mp->m_ialloc_blks;
643 if (do_sparse)
644 goto sparse_alloc;
393 if (likely(newino != NULLAGINO && 645 if (likely(newino != NULLAGINO &&
394 (args.agbno < be32_to_cpu(agi->agi_length)))) { 646 (args.agbno < be32_to_cpu(agi->agi_length)))) {
395 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 647 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc(
428 * subsequent requests. 680 * subsequent requests.
429 */ 681 */
430 args.minalignslop = 0; 682 args.minalignslop = 0;
431 } else 683 }
432 args.fsbno = NULLFSBLOCK;
433 684
434 if (unlikely(args.fsbno == NULLFSBLOCK)) { 685 if (unlikely(args.fsbno == NULLFSBLOCK)) {
435 /* 686 /*
@@ -480,6 +731,46 @@ xfs_ialloc_ag_alloc(
480 return error; 731 return error;
481 } 732 }
482 733
734 /*
735 * Finally, try a sparse allocation if the filesystem supports it and
736 * the sparse allocation length is smaller than a full chunk.
737 */
738 if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
739 args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
740 args.fsbno == NULLFSBLOCK) {
741sparse_alloc:
742 args.type = XFS_ALLOCTYPE_NEAR_BNO;
743 args.agbno = be32_to_cpu(agi->agi_root);
744 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
745 args.alignment = args.mp->m_sb.sb_spino_align;
746 args.prod = 1;
747
748 args.minlen = args.mp->m_ialloc_min_blks;
749 args.maxlen = args.minlen;
750
751 /*
752 * The inode record will be aligned to full chunk size. We must
753 * prevent sparse allocation from AG boundaries that result in
754 * invalid inode records, such as records that start at agbno 0
755 * or extend beyond the AG.
756 *
757 * Set min agbno to the first aligned, non-zero agbno and max to
758 * the last aligned agbno that is at least one full chunk from
759 * the end of the AG.
760 */
761 args.min_agbno = args.mp->m_sb.sb_inoalignmt;
762 args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
763 args.mp->m_sb.sb_inoalignmt) -
764 args.mp->m_ialloc_blks;
765
766 error = xfs_alloc_vextent(&args);
767 if (error)
768 return error;
769
770 newlen = args.len << args.mp->m_sb.sb_inopblog;
771 allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
772 }
773
483 if (args.fsbno == NULLFSBLOCK) { 774 if (args.fsbno == NULLFSBLOCK) {
484 *alloc = 0; 775 *alloc = 0;
485 return 0; 776 return 0;
@@ -495,8 +786,8 @@ xfs_ialloc_ag_alloc(
495 * rather than a linear progression to prevent the next generation 786 * rather than a linear progression to prevent the next generation
496 * number from being easily guessable. 787 * number from being easily guessable.
497 */ 788 */
498 error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, 789 error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
499 args.len, prandom_u32()); 790 args.agbno, args.len, prandom_u32());
500 791
501 if (error) 792 if (error)
502 return error; 793 return error;
@@ -504,6 +795,73 @@ xfs_ialloc_ag_alloc(
504 * Convert the results. 795 * Convert the results.
505 */ 796 */
506 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 797 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
798
799 if (xfs_inobt_issparse(~allocmask)) {
800 /*
801 * We've allocated a sparse chunk. Align the startino and mask.
802 */
803 xfs_align_sparse_ino(args.mp, &newino, &allocmask);
804
805 rec.ir_startino = newino;
806 rec.ir_holemask = ~allocmask;
807 rec.ir_count = newlen;
808 rec.ir_freecount = newlen;
809 rec.ir_free = XFS_INOBT_ALL_FREE;
810
811 /*
812 * Insert the sparse record into the inobt and allow for a merge
813 * if necessary. If a merge does occur, rec is updated to the
814 * merged record.
815 */
816 error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
817 &rec, true);
818 if (error == -EFSCORRUPTED) {
819 xfs_alert(args.mp,
820 "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
821 XFS_AGINO_TO_INO(args.mp, agno,
822 rec.ir_startino),
823 rec.ir_holemask, rec.ir_count);
824 xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
825 }
826 if (error)
827 return error;
828
829 /*
830 * Unlike for the inobt, we can't merge the part we've just allocated
831 * into the finobt due to finobt semantics. The original record may or may not
832 * exist independent of whether physical inodes exist in this
833 * sparse chunk.
834 *
835 * We must update the finobt record based on the inobt record.
836 * rec contains the fully merged and up to date inobt record
837 * from the previous call. Set merge false to replace any
838 * existing record with this one.
839 */
840 if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
841 error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
842 XFS_BTNUM_FINO, &rec,
843 false);
844 if (error)
845 return error;
846 }
847 } else {
848 /* full chunk - insert new records to both btrees */
849 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
850 XFS_BTNUM_INO);
851 if (error)
852 return error;
853
854 if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
855 error = xfs_inobt_insert(args.mp, tp, agbp, newino,
856 newlen, XFS_BTNUM_FINO);
857 if (error)
858 return error;
859 }
860 }
861
862 /*
863 * Update AGI counts and newino.
864 */
507 be32_add_cpu(&agi->agi_count, newlen); 865 be32_add_cpu(&agi->agi_count, newlen);
508 be32_add_cpu(&agi->agi_freecount, newlen); 866 be32_add_cpu(&agi->agi_freecount, newlen);
509 pag = xfs_perag_get(args.mp, agno); 867 pag = xfs_perag_get(args.mp, agno);
@@ -512,20 +870,6 @@ xfs_ialloc_ag_alloc(
512 agi->agi_newino = cpu_to_be32(newino); 870 agi->agi_newino = cpu_to_be32(newino);
513 871
514 /* 872 /*
515 * Insert records describing the new inode chunk into the btrees.
516 */
517 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
518 XFS_BTNUM_INO);
519 if (error)
520 return error;
521
522 if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
523 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
524 XFS_BTNUM_FINO);
525 if (error)
526 return error;
527 }
528 /*
529 * Log allocation group header fields 873 * Log allocation group header fields
530 */ 874 */
531 xfs_ialloc_log_agi(tp, agbp, 875 xfs_ialloc_log_agi(tp, agbp,
@@ -645,7 +989,7 @@ xfs_ialloc_ag_select(
645 * if we fail allocation due to alignment issues then it is most 989 * if we fail allocation due to alignment issues then it is most
646 * likely a real ENOSPC condition. 990 * likely a real ENOSPC condition.
647 */ 991 */
648 ineed = mp->m_ialloc_blks; 992 ineed = mp->m_ialloc_min_blks;
649 if (flags && ineed > 1) 993 if (flags && ineed > 1)
650 ineed += xfs_ialloc_cluster_alignment(mp); 994 ineed += xfs_ialloc_cluster_alignment(mp);
651 longest = pag->pagf_longest; 995 longest = pag->pagf_longest;
@@ -732,6 +1076,27 @@ xfs_ialloc_get_rec(
732} 1076}
733 1077
734/* 1078/*
1079 * Return the offset of the first free inode in the record. If the inode chunk
1080 * is sparsely allocated, we convert the record holemask to inode granularity
1081 * and mask off the unallocated regions from the inode free mask.
1082 */
1083STATIC int
1084xfs_inobt_first_free_inode(
1085 struct xfs_inobt_rec_incore *rec)
1086{
1087 xfs_inofree_t realfree;
1088
1089 /* if there are no holes, return the first available offset */
1090 if (!xfs_inobt_issparse(rec->ir_holemask))
1091 return xfs_lowbit64(rec->ir_free);
1092
1093 realfree = xfs_inobt_irec_to_allocmask(rec);
1094 realfree &= rec->ir_free;
1095
1096 return xfs_lowbit64(realfree);
1097}
1098
1099/*
735 * Allocate an inode using the inobt-only algorithm. 1100 * Allocate an inode using the inobt-only algorithm.
736 */ 1101 */
737STATIC int 1102STATIC int
@@ -961,7 +1326,7 @@ newino:
961 } 1326 }
962 1327
963alloc_inode: 1328alloc_inode:
964 offset = xfs_lowbit64(rec.ir_free); 1329 offset = xfs_inobt_first_free_inode(&rec);
965 ASSERT(offset >= 0); 1330 ASSERT(offset >= 0);
966 ASSERT(offset < XFS_INODES_PER_CHUNK); 1331 ASSERT(offset < XFS_INODES_PER_CHUNK);
967 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 1332 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1575,7 @@ xfs_dialloc_ag(
1210 if (error) 1575 if (error)
1211 goto error_cur; 1576 goto error_cur;
1212 1577
1213 offset = xfs_lowbit64(rec.ir_free); 1578 offset = xfs_inobt_first_free_inode(&rec);
1214 ASSERT(offset >= 0); 1579 ASSERT(offset >= 0);
1215 ASSERT(offset < XFS_INODES_PER_CHUNK); 1580 ASSERT(offset < XFS_INODES_PER_CHUNK);
1216 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 1581 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1439,6 +1804,83 @@ out_error:
1439 return error; 1804 return error;
1440} 1805}
1441 1806
1807/*
1808 * Free the blocks of an inode chunk. We must consider that the inode chunk
1809 * might be sparse and only free the regions that are allocated as part of the
1810 * chunk.
1811 */
1812STATIC void
1813xfs_difree_inode_chunk(
1814 struct xfs_mount *mp,
1815 xfs_agnumber_t agno,
1816 struct xfs_inobt_rec_incore *rec,
1817 struct xfs_bmap_free *flist)
1818{
1819 xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
1820 int startidx, endidx;
1821 int nextbit;
1822 xfs_agblock_t agbno;
1823 int contigblk;
1824 DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
1825
1826 if (!xfs_inobt_issparse(rec->ir_holemask)) {
1827 /* not sparse, calculate extent info directly */
1828 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
1829 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
1830 mp->m_ialloc_blks, flist, mp);
1831 return;
1832 }
1833
1834 /* holemask is only 16-bits (fits in an unsigned long) */
1835 ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
1836 holemask[0] = rec->ir_holemask;
1837
1838 /*
1839 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
1840 * holemask and convert the start/end index of each range to an extent.
1841 * We start with the start and end index both pointing at the first 0 in
1842 * the mask.
1843 */
1844 startidx = endidx = find_first_zero_bit(holemask,
1845 XFS_INOBT_HOLEMASK_BITS);
1846 nextbit = startidx + 1;
1847 while (startidx < XFS_INOBT_HOLEMASK_BITS) {
1848 nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
1849 nextbit);
1850 /*
1851 * If the next zero bit is contiguous, update the end index of
1852 * the current range and continue.
1853 */
1854 if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
1855 nextbit == endidx + 1) {
1856 endidx = nextbit;
1857 goto next;
1858 }
1859
1860 /*
1861 * nextbit is not contiguous with the current end index. Convert
1862 * the current start/end to an extent and add it to the free
1863 * list.
1864 */
1865 agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
1866 mp->m_sb.sb_inopblock;
1867 contigblk = ((endidx - startidx + 1) *
1868 XFS_INODES_PER_HOLEMASK_BIT) /
1869 mp->m_sb.sb_inopblock;
1870
1871 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1872 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1873 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
1874 flist, mp);
1875
1876 /* reset range to current bit and carry on... */
1877 startidx = endidx = nextbit;
1878
1879next:
1880 nextbit++;
1881 }
1882}
1883
1442STATIC int 1884STATIC int
1443xfs_difree_inobt( 1885xfs_difree_inobt(
1444 struct xfs_mount *mp, 1886 struct xfs_mount *mp,
@@ -1446,8 +1888,7 @@ xfs_difree_inobt(
1446 struct xfs_buf *agbp, 1888 struct xfs_buf *agbp,
1447 xfs_agino_t agino, 1889 xfs_agino_t agino,
1448 struct xfs_bmap_free *flist, 1890 struct xfs_bmap_free *flist,
1449 int *deleted, 1891 struct xfs_icluster *xic,
1450 xfs_ino_t *first_ino,
1451 struct xfs_inobt_rec_incore *orec) 1892 struct xfs_inobt_rec_incore *orec)
1452{ 1893{
1453 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); 1894 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
@@ -1501,20 +1942,23 @@ xfs_difree_inobt(
1501 rec.ir_freecount++; 1942 rec.ir_freecount++;
1502 1943
1503 /* 1944 /*
1504 * When an inode cluster is free, it becomes eligible for removal 1945 * When an inode chunk is free, it becomes eligible for removal. Don't
1946 * remove the chunk if the block size is large enough for multiple inode
1947 * chunks (that might not be free).
1505 */ 1948 */
1506 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1949 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1507 (rec.ir_freecount == mp->m_ialloc_inos)) { 1950 rec.ir_free == XFS_INOBT_ALL_FREE &&
1508 1951 mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
1509 *deleted = 1; 1952 xic->deleted = 1;
1510 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1953 xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1954 xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
1511 1955
1512 /* 1956 /*
1513 * Remove the inode cluster from the AGI B+Tree, adjust the 1957 * Remove the inode cluster from the AGI B+Tree, adjust the
1514 * AGI and Superblock inode counts, and mark the disk space 1958 * AGI and Superblock inode counts, and mark the disk space
1515 * to be freed when the transaction is committed. 1959 * to be freed when the transaction is committed.
1516 */ 1960 */
1517 ilen = mp->m_ialloc_inos; 1961 ilen = rec.ir_freecount;
1518 be32_add_cpu(&agi->agi_count, -ilen); 1962 be32_add_cpu(&agi->agi_count, -ilen);
1519 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1963 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1520 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1964 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1530,11 +1974,9 @@ xfs_difree_inobt(
1530 goto error0; 1974 goto error0;
1531 } 1975 }
1532 1976
1533 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, 1977 xfs_difree_inode_chunk(mp, agno, &rec, flist);
1534 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
1535 mp->m_ialloc_blks, flist, mp);
1536 } else { 1978 } else {
1537 *deleted = 0; 1979 xic->deleted = 0;
1538 1980
1539 error = xfs_inobt_update(cur, &rec); 1981 error = xfs_inobt_update(cur, &rec);
1540 if (error) { 1982 if (error) {
@@ -1599,7 +2041,9 @@ xfs_difree_finobt(
1599 */ 2041 */
1600 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); 2042 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
1601 2043
1602 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, 2044 error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2045 ibtrec->ir_count,
2046 ibtrec->ir_freecount,
1603 ibtrec->ir_free, &i); 2047 ibtrec->ir_free, &i);
1604 if (error) 2048 if (error)
1605 goto error; 2049 goto error;
@@ -1634,8 +2078,13 @@ xfs_difree_finobt(
1634 * free inode. Hence, if all of the inodes are free and we aren't 2078 * free inode. Hence, if all of the inodes are free and we aren't
1635 * keeping inode chunks permanently on disk, remove the record. 2079 * keeping inode chunks permanently on disk, remove the record.
1636 * Otherwise, update the record with the new information. 2080 * Otherwise, update the record with the new information.
2081 *
2082 * Note that we currently can't free chunks when the block size is large
2083 * enough for multiple chunks. Leave the finobt record to remain in sync
2084 * with the inobt.
1637 */ 2085 */
1638 if (rec.ir_freecount == mp->m_ialloc_inos && 2086 if (rec.ir_free == XFS_INOBT_ALL_FREE &&
2087 mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
1639 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 2088 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1640 error = xfs_btree_delete(cur, &i); 2089 error = xfs_btree_delete(cur, &i);
1641 if (error) 2090 if (error)
@@ -1671,8 +2120,7 @@ xfs_difree(
1671 struct xfs_trans *tp, /* transaction pointer */ 2120 struct xfs_trans *tp, /* transaction pointer */
1672 xfs_ino_t inode, /* inode to be freed */ 2121 xfs_ino_t inode, /* inode to be freed */
1673 struct xfs_bmap_free *flist, /* extents to free */ 2122 struct xfs_bmap_free *flist, /* extents to free */
1674 int *deleted,/* set if inode cluster was deleted */ 2123 struct xfs_icluster *xic) /* cluster info if deleted */
1675 xfs_ino_t *first_ino)/* first inode in deleted cluster */
1676{ 2124{
1677 /* REFERENCED */ 2125 /* REFERENCED */
1678 xfs_agblock_t agbno; /* block number containing inode */ 2126 xfs_agblock_t agbno; /* block number containing inode */
@@ -1723,8 +2171,7 @@ xfs_difree(
1723 /* 2171 /*
1724 * Fix up the inode allocation btree. 2172 * Fix up the inode allocation btree.
1725 */ 2173 */
1726 error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, 2174 error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
1727 &rec);
1728 if (error) 2175 if (error)
1729 goto error0; 2176 goto error0;
1730 2177
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 100007d56449..12401fea7bff 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
28/* Move inodes in clusters of this size */ 28/* Move inodes in clusters of this size */
29#define XFS_INODE_BIG_CLUSTER_SIZE 8192 29#define XFS_INODE_BIG_CLUSTER_SIZE 8192
30 30
31struct xfs_icluster {
32 bool deleted; /* record is deleted */
33 xfs_ino_t first_ino; /* first inode number */
34 uint64_t alloc; /* inode phys. allocation bitmap for
35 * sparse chunks */
36};
37
31/* Calculate and return the number of filesystem blocks per inode cluster */ 38/* Calculate and return the number of filesystem blocks per inode cluster */
32static inline int 39static inline int
33xfs_icluster_size_fsb( 40xfs_icluster_size_fsb(
@@ -90,8 +97,7 @@ xfs_difree(
90 struct xfs_trans *tp, /* transaction pointer */ 97 struct xfs_trans *tp, /* transaction pointer */
91 xfs_ino_t inode, /* inode to be freed */ 98 xfs_ino_t inode, /* inode to be freed */
92 struct xfs_bmap_free *flist, /* extents to free */ 99 struct xfs_bmap_free *flist, /* extents to free */
93 int *deleted, /* set if inode cluster was deleted */ 100 struct xfs_icluster *ifree); /* cluster info if deleted */
94 xfs_ino_t *first_ino); /* first inode in deleted cluster */
95 101
96/* 102/*
97 * Return the location of the inode in imap, for mapping it into a buffer. 103 * Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +162,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
156 * Inode chunk initialisation routine 162 * Inode chunk initialisation routine
157 */ 163 */
158int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, 164int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
159 struct list_head *buffer_list, 165 struct list_head *buffer_list, int icount,
160 xfs_agnumber_t agno, xfs_agblock_t agbno, 166 xfs_agnumber_t agno, xfs_agblock_t agbno,
161 xfs_agblock_t length, unsigned int gen); 167 xfs_agblock_t length, unsigned int gen);
162 168
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 964c465ca69c..674ad8f760be 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
167 union xfs_btree_rec *rec) 167 union xfs_btree_rec *rec)
168{ 168{
169 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); 169 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
170 rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); 170 if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
171 rec->inobt.ir_u.sp.ir_holemask =
172 cpu_to_be16(cur->bc_rec.i.ir_holemask);
173 rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
174 rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
175 } else {
176 /* ir_holemask/ir_count not supported on-disk */
177 rec->inobt.ir_u.f.ir_freecount =
178 cpu_to_be32(cur->bc_rec.i.ir_freecount);
179 }
171 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); 180 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
172} 181}
173 182
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
418 return blocklen / sizeof(xfs_inobt_rec_t); 427 return blocklen / sizeof(xfs_inobt_rec_t);
419 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); 428 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
420} 429}
430
431/*
432 * Convert the inode record holemask to an inode allocation bitmap. The inode
433 * allocation bitmap is inode granularity and specifies whether an inode is
434 * physically allocated on disk (not whether the inode is considered allocated
435 * or free by the fs).
436 *
437 * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
438 */
439uint64_t
440xfs_inobt_irec_to_allocmask(
441 struct xfs_inobt_rec_incore *rec)
442{
443 uint64_t bitmap = 0;
444 uint64_t inodespbit;
445 int nextbit;
446 uint allocbitmap;
447
448 /*
449 * The holemask has 16-bits for a 64 inode record. Therefore each
450 * holemask bit represents multiple inodes. Create a mask of bits to set
451 * in the allocmask for each holemask bit.
452 */
453 inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
454
455 /*
456 * Allocated inodes are represented by 0 bits in holemask. Invert the 0
457 * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
458 * anything beyond the 16 holemask bits since this casts to a larger
459 * type.
460 */
461 allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
462
463 /*
464 * allocbitmap is the inverted holemask so every set bit represents
465 * allocated inodes. To expand from 16-bit holemask granularity to
466 * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
467 * bitmap for every holemask bit.
468 */
469 nextbit = xfs_next_bit(&allocbitmap, 1, 0);
470 while (nextbit != -1) {
471 ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
472
473 bitmap |= (inodespbit <<
474 (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
475
476 nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
477 }
478
479 return bitmap;
480}
481
#if defined(DEBUG) || defined(XFS_WARN)
/*
 * Check that the inode count of an in-core inode record matches its holemask.
 *
 * Counts the set bits of the allocation bitmap derived from the record and
 * compares the total with ir_count. Returns 0 when they agree and
 * -EFSCORRUPTED when they do not.
 */
int
xfs_inobt_rec_check_count(
	struct xfs_mount		*mp,
	struct xfs_inobt_rec_incore	*rec)
{
	uint64_t	allocbmap = xfs_inobt_irec_to_allocmask(rec);
	/* xfs_next_bit() works on an array of uints; size the bitmap in uints */
	int		nwords = sizeof(allocbmap) / sizeof(unsigned int);
	int		nallocated = 0;
	int		bit;

	for (bit = xfs_next_bit((uint *) &allocbmap, nwords, 0);
	     bit != -1;
	     bit = xfs_next_bit((uint *) &allocbmap, nwords, bit + 1))
		nallocated++;

	if (nallocated != rec->ir_count)
		return -EFSCORRUPTED;

	return 0;
}
#endif /* DEBUG */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..bd88453217ce 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
62 xfs_btnum_t); 62 xfs_btnum_t);
63extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); 63extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
64 64
65/* ir_holemask to inode allocation bitmap conversion */
66uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
67
68#if defined(DEBUG) || defined(XFS_WARN)
69int xfs_inobt_rec_check_count(struct xfs_mount *,
70 struct xfs_inobt_rec_incore *);
71#else
72#define xfs_inobt_rec_check_count(mp, rec) 0
73#endif /* DEBUG */
74
65#endif /* __XFS_IALLOC_BTREE_H__ */ 75#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index dc4bfc5d88fc..019dc324a146 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -174,6 +174,27 @@ xfs_mount_validate_sb(
174 return -EFSCORRUPTED; 174 return -EFSCORRUPTED;
175 } 175 }
176 176
177 /*
178 * Full inode chunks must be aligned to inode chunk size when
179 * sparse inodes are enabled to support the sparse chunk
180 * allocation algorithm and prevent overlapping inode records.
181 */
182 if (xfs_sb_version_hassparseinodes(sbp)) {
183 uint32_t align;
184
185 xfs_alert(mp,
186 "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
187
188 align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
189 >> sbp->sb_blocklog;
190 if (sbp->sb_inoalignmt != align) {
191 xfs_warn(mp,
192"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
193 sbp->sb_inoalignmt, align);
194 return -EINVAL;
195 }
196 }
197
177 if (unlikely( 198 if (unlikely(
178 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 199 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
179 xfs_warn(mp, 200 xfs_warn(mp,
@@ -374,7 +395,7 @@ __xfs_sb_from_disk(
374 be32_to_cpu(from->sb_features_log_incompat); 395 be32_to_cpu(from->sb_features_log_incompat);
375 /* crc is only used on disk, not in memory; just init to 0 here. */ 396 /* crc is only used on disk, not in memory; just init to 0 here. */
376 to->sb_crc = 0; 397 to->sb_crc = 0;
377 to->sb_pad = 0; 398 to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
378 to->sb_pquotino = be64_to_cpu(from->sb_pquotino); 399 to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
379 to->sb_lsn = be64_to_cpu(from->sb_lsn); 400 to->sb_lsn = be64_to_cpu(from->sb_lsn);
380 /* Convert on-disk flags to in-memory flags? */ 401 /* Convert on-disk flags to in-memory flags? */
@@ -516,7 +537,7 @@ xfs_sb_to_disk(
516 cpu_to_be32(from->sb_features_incompat); 537 cpu_to_be32(from->sb_features_incompat);
517 to->sb_features_log_incompat = 538 to->sb_features_log_incompat =
518 cpu_to_be32(from->sb_features_log_incompat); 539 cpu_to_be32(from->sb_features_log_incompat);
519 to->sb_pad = 0; 540 to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
520 to->sb_lsn = cpu_to_be64(from->sb_lsn); 541 to->sb_lsn = cpu_to_be64(from->sb_lsn);
521 } 542 }
522} 543}
@@ -689,6 +710,11 @@ xfs_sb_mount_common(
689 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, 710 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
690 sbp->sb_inopblock); 711 sbp->sb_inopblock);
691 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; 712 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
713
714 if (sbp->sb_spino_align)
715 mp->m_ialloc_min_blks = sbp->sb_spino_align;
716 else
717 mp->m_ialloc_min_blks = mp->m_ialloc_blks;
692} 718}
693 719
694/* 720/*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cb7e8a29dfb6..4bd6463cd931 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
101 (xfs_sb_version_hasftype(&mp->m_sb) ? 101 (xfs_sb_version_hasftype(&mp->m_sb) ?
102 XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | 102 XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
103 (xfs_sb_version_hasfinobt(&mp->m_sb) ? 103 (xfs_sb_version_hasfinobt(&mp->m_sb) ?
104 XFS_FSOP_GEOM_FLAGS_FINOBT : 0); 104 XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
105 (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
106 XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
105 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 107 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
106 mp->m_sb.sb_logsectsize : BBSIZE; 108 mp->m_sb.sb_logsectsize : BBSIZE;
107 geo->rtsectsize = mp->m_sb.sb_blocksize; 109 geo->rtsectsize = mp->m_sb.sb_blocksize;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 539a85fddbc2..4c054f6634b9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2235,9 +2235,9 @@ xfs_iunlink_remove(
2235 */ 2235 */
2236STATIC int 2236STATIC int
2237xfs_ifree_cluster( 2237xfs_ifree_cluster(
2238 xfs_inode_t *free_ip, 2238 xfs_inode_t *free_ip,
2239 xfs_trans_t *tp, 2239 xfs_trans_t *tp,
2240 xfs_ino_t inum) 2240 struct xfs_icluster *xic)
2241{ 2241{
2242 xfs_mount_t *mp = free_ip->i_mount; 2242 xfs_mount_t *mp = free_ip->i_mount;
2243 int blks_per_cluster; 2243 int blks_per_cluster;
@@ -2250,13 +2250,26 @@ xfs_ifree_cluster(
2250 xfs_inode_log_item_t *iip; 2250 xfs_inode_log_item_t *iip;
2251 xfs_log_item_t *lip; 2251 xfs_log_item_t *lip;
2252 struct xfs_perag *pag; 2252 struct xfs_perag *pag;
2253 xfs_ino_t inum;
2253 2254
2255 inum = xic->first_ino;
2254 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 2256 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2255 blks_per_cluster = xfs_icluster_size_fsb(mp); 2257 blks_per_cluster = xfs_icluster_size_fsb(mp);
2256 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; 2258 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2257 nbufs = mp->m_ialloc_blks / blks_per_cluster; 2259 nbufs = mp->m_ialloc_blks / blks_per_cluster;
2258 2260
2259 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { 2261 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2262 /*
2263 * The allocation bitmap tells us which inodes of the chunk were
2264 * physically allocated. Skip the cluster if an inode falls into
2265 * a sparse region.
2266 */
2267 if ((xic->alloc & XFS_INOBT_MASK(inum - xic->first_ino)) == 0) {
2268 ASSERT(((inum - xic->first_ino) %
2269 inodes_per_cluster) == 0);
2270 continue;
2271 }
2272
2260 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2273 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2261 XFS_INO_TO_AGBNO(mp, inum)); 2274 XFS_INO_TO_AGBNO(mp, inum));
2262 2275
@@ -2414,8 +2427,7 @@ xfs_ifree(
2414 xfs_bmap_free_t *flist) 2427 xfs_bmap_free_t *flist)
2415{ 2428{
2416 int error; 2429 int error;
2417 int delete; 2430 struct xfs_icluster xic = { 0 };
2418 xfs_ino_t first_ino;
2419 2431
2420 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2432 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2421 ASSERT(ip->i_d.di_nlink == 0); 2433 ASSERT(ip->i_d.di_nlink == 0);
@@ -2431,7 +2443,7 @@ xfs_ifree(
2431 if (error) 2443 if (error)
2432 return error; 2444 return error;
2433 2445
2434 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2446 error = xfs_difree(tp, ip->i_ino, flist, &xic);
2435 if (error) 2447 if (error)
2436 return error; 2448 return error;
2437 2449
@@ -2448,8 +2460,8 @@ xfs_ifree(
2448 ip->i_d.di_gen++; 2460 ip->i_d.di_gen++;
2449 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2461 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2450 2462
2451 if (delete) 2463 if (xic.deleted)
2452 error = xfs_ifree_cluster(ip, tp, first_ino); 2464 error = xfs_ifree_cluster(ip, tp, &xic);
2453 2465
2454 return error; 2466 return error;
2455} 2467}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 80429891dc9b..f41b0c3fddab 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
252 } 252 }
253 253
254 irec->ir_free |= xfs_inobt_maskn(0, idx); 254 irec->ir_free |= xfs_inobt_maskn(0, idx);
255 *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; 255 *icount = irec->ir_count - irec->ir_freecount;
256 } 256 }
257 257
258 return 0; 258 return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
415 goto del_cursor; 415 goto del_cursor;
416 if (icount) { 416 if (icount) {
417 irbp->ir_startino = r.ir_startino; 417 irbp->ir_startino = r.ir_startino;
418 irbp->ir_holemask = r.ir_holemask;
419 irbp->ir_count = r.ir_count;
418 irbp->ir_freecount = r.ir_freecount; 420 irbp->ir_freecount = r.ir_freecount;
419 irbp->ir_free = r.ir_free; 421 irbp->ir_free = r.ir_free;
420 irbp++; 422 irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
447 * If this chunk has any allocated inodes, save it. 449 * If this chunk has any allocated inodes, save it.
448 * Also start read-ahead now for this chunk. 450 * Also start read-ahead now for this chunk.
449 */ 451 */
450 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 452 if (r.ir_freecount < r.ir_count) {
451 xfs_bulkstat_ichunk_ra(mp, agno, &r); 453 xfs_bulkstat_ichunk_ra(mp, agno, &r);
452 irbp->ir_startino = r.ir_startino; 454 irbp->ir_startino = r.ir_startino;
455 irbp->ir_holemask = r.ir_holemask;
456 irbp->ir_count = r.ir_count;
453 irbp->ir_freecount = r.ir_freecount; 457 irbp->ir_freecount = r.ir_freecount;
454 irbp->ir_free = r.ir_free; 458 irbp->ir_free = r.ir_free;
455 irbp++; 459 irbp++;
456 icount += XFS_INODES_PER_CHUNK - r.ir_freecount; 460 icount += r.ir_count - r.ir_freecount;
457 } 461 }
458 error = xfs_btree_increment(cur, 0, &stat); 462 error = xfs_btree_increment(cur, 0, &stat);
459 if (error || stat == 0) { 463 if (error || stat == 0) {
@@ -599,8 +603,7 @@ xfs_inumbers(
599 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; 603 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
600 buffer[bufidx].xi_startino = 604 buffer[bufidx].xi_startino =
601 XFS_AGINO_TO_INO(mp, agno, r.ir_startino); 605 XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
602 buffer[bufidx].xi_alloccount = 606 buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
603 XFS_INODES_PER_CHUNK - r.ir_freecount;
604 buffer[bufidx].xi_allocmask = ~r.ir_free; 607 buffer[bufidx].xi_allocmask = ~r.ir_free;
605 if (++bufidx == bcount) { 608 if (++bufidx == bcount) {
606 long written; 609 long written;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4f5784f85a5b..4a8c440b6280 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3068,12 +3068,22 @@ xlog_recover_do_icreate_pass2(
3068 return -EINVAL; 3068 return -EINVAL;
3069 } 3069 }
3070 3070
3071 /* existing allocation is fixed value */ 3071 /*
3072 ASSERT(count == mp->m_ialloc_inos); 3072 * The inode chunk is either full or sparse and we only support
3073 ASSERT(length == mp->m_ialloc_blks); 3073 * m_ialloc_min_blks sized sparse allocations at this time.
3074 if (count != mp->m_ialloc_inos || 3074 */
3075 length != mp->m_ialloc_blks) { 3075 if (length != mp->m_ialloc_blks &&
3076 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); 3076 length != mp->m_ialloc_min_blks) {
3077 xfs_warn(log->l_mp,
3078 "%s: unsupported chunk length", __FUNCTION__);
3079 return -EINVAL;
3080 }
3081
3082 /* verify inode count is consistent with extent length */
3083 if ((count >> mp->m_sb.sb_inopblog) != length) {
3084 xfs_warn(log->l_mp,
3085 "%s: inconsistent inode count and chunk length",
3086 __FUNCTION__);
3077 return -EINVAL; 3087 return -EINVAL;
3078 } 3088 }
3079 3089
@@ -3091,8 +3101,8 @@ xlog_recover_do_icreate_pass2(
3091 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) 3101 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3092 return 0; 3102 return 0;
3093 3103
3094 xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, 3104 xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
3095 be32_to_cpu(icl->icl_gen)); 3105 be32_to_cpu(icl->icl_gen));
3096 return 0; 3106 return 0;
3097} 3107}
3098 3108
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6f23fbdfb365..461e791efad7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -725,6 +725,22 @@ xfs_mountfs(
725 } 725 }
726 726
727 /* 727 /*
728 * If enabled, sparse inode chunk alignment is expected to match the
729 * cluster size. Full inode chunk alignment must match the chunk size,
730 * but that is checked on sb read verification...
731 */
732 if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
733 mp->m_sb.sb_spino_align !=
734 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
735 xfs_warn(mp,
736 "Sparse inode block alignment (%u) must match cluster size (%llu).",
737 mp->m_sb.sb_spino_align,
738 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
739 error = -EINVAL;
740 goto out_remove_uuid;
741 }
742
743 /*
728 * Set inode alignment fields 744 * Set inode alignment fields
729 */ 745 */
730 xfs_set_inoalignment(mp); 746 xfs_set_inoalignment(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8c995a2ccb6f..df209c290258 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,6 +101,8 @@ typedef struct xfs_mount {
101 __uint64_t m_flags; /* global mount flags */ 101 __uint64_t m_flags; /* global mount flags */
102 int m_ialloc_inos; /* inodes in inode allocation */ 102 int m_ialloc_inos; /* inodes in inode allocation */
103 int m_ialloc_blks; /* blocks in inode allocation */ 103 int m_ialloc_blks; /* blocks in inode allocation */
104 int m_ialloc_min_blks;/* min blocks in sparse inode
105 * allocation */
104 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 106 int m_inoalign_mask;/* mask sb_inoalignmt if used */
105 uint m_qflags; /* quota status flags */ 107 uint m_qflags; /* quota status flags */
106 struct xfs_trans_resv m_resv; /* precomputed res values */ 108 struct xfs_trans_resv m_resv; /* precomputed res values */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 615781bf4ee5..8d916d33d93d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
738 __entry->blocks, __entry->shift, __entry->writeio_blocks) 738 __entry->blocks, __entry->shift, __entry->writeio_blocks)
739) 739)
740 740
741TRACE_EVENT(xfs_irec_merge_pre,
742 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
743 uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
744 TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
745 TP_STRUCT__entry(
746 __field(dev_t, dev)
747 __field(xfs_agnumber_t, agno)
748 __field(xfs_agino_t, agino)
749 __field(uint16_t, holemask)
750 __field(xfs_agino_t, nagino)
751 __field(uint16_t, nholemask)
752 ),
753 TP_fast_assign(
754 __entry->dev = mp->m_super->s_dev;
755 __entry->agno = agno;
756 __entry->agino = agino;
757 __entry->holemask = holemask;
758 __entry->nagino = nagino;
759 __entry->nholemask = holemask;
760 ),
761 TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
762 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
763 __entry->agino, __entry->holemask, __entry->nagino,
764 __entry->nholemask)
765)
766
767TRACE_EVENT(xfs_irec_merge_post,
768 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
769 uint16_t holemask),
770 TP_ARGS(mp, agno, agino, holemask),
771 TP_STRUCT__entry(
772 __field(dev_t, dev)
773 __field(xfs_agnumber_t, agno)
774 __field(xfs_agino_t, agino)
775 __field(uint16_t, holemask)
776 ),
777 TP_fast_assign(
778 __entry->dev = mp->m_super->s_dev;
779 __entry->agno = agno;
780 __entry->agino = agino;
781 __entry->holemask = holemask;
782 ),
783 TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
784 MINOR(__entry->dev), __entry->agno, __entry->agino,
785 __entry->holemask)
786)
787
741#define DEFINE_IREF_EVENT(name) \ 788#define DEFINE_IREF_EVENT(name) \
742DEFINE_EVENT(xfs_iref_class, name, \ 789DEFINE_EVENT(xfs_iref_class, name, \
743 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ 790 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \