aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/libxfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/libxfs')
-rw-r--r--fs/xfs/libxfs/xfs_ag.h281
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2630
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h234
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c504
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h65
-rw-r--r--fs/xfs/libxfs/xfs_attr.c1459
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c2697
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h108
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c628
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h27
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h70
-rw-r--r--fs/xfs/libxfs/xfs_bit.h87
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c5606
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h186
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c967
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h143
-rw-r--r--fs/xfs/libxfs/xfs_btree.c4069
-rw-r--r--fs/xfs/libxfs/xfs_btree.h468
-rw-r--r--fs/xfs/libxfs/xfs_cksum.h63
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2665
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h221
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c911
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h861
-rw-r--r--fs/xfs/libxfs/xfs_dinode.h243
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c762
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h180
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c1265
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c1050
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c1831
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c2284
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h274
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c1184
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c290
-rw-r--r--fs/xfs/libxfs/xfs_format.h428
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c2189
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h163
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c422
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h65
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c479
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h50
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c1906
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h171
-rw-r--r--fs/xfs/libxfs/xfs_inum.h64
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h679
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h66
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c150
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h161
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c973
-rw-r--r--fs/xfs/libxfs/xfs_sb.c836
-rw-r--r--fs/xfs/libxfs/xfs_sb.h621
-rw-r--r--fs/xfs/libxfs/xfs_shared.h246
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c201
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c894
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h117
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h92
55 files changed, 45286 insertions, 0 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644
index 000000000000..6e247a99f5db
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -0,0 +1,281 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_AG_H__
19#define __XFS_AG_H__
20
/*
 * Allocation group header.
 *
 * The header is split across three structures.  They live in sequential
 * 512-byte buffers that follow a copy of the superblock, which itself
 * occupies a 512-byte buffer.
 */

struct xfs_buf;
struct xfs_mount;
struct xfs_trans;

/* Magic numbers; each value is the ASCII encoding of the quoted tag. */
#define	XFS_AGF_MAGIC	0x58414746	/* 'XAGF' */
#define	XFS_AGI_MAGIC	0x58414749	/* 'XAGI' */
#define	XFS_AGFL_MAGIC	0x5841464c	/* 'XAFL' */

/* Header version numbers accepted by the *_GOOD_VERSION() checks below. */
#define	XFS_AGF_VERSION	1
#define	XFS_AGI_VERSION	1

#define	XFS_AGF_GOOD_VERSION(v)	((v) == XFS_AGF_VERSION)
#define	XFS_AGI_GOOD_VERSION(v)	((v) == XFS_AGI_VERSION)

/*
 * Number of freespace btrees recorded in the AGF: btree number 0 is bno,
 * 1 is cnt.  This value sizes the agf_roots[] and agf_levels[] arrays.
 */
#define	XFS_BTNUM_AGF	((int)XFS_BTNUM_CNTi + 1)
45
46/*
47 * The second word of agf_levels in the first a.g. overlaps the EFS
48 * superblock's magic number. Since the magic numbers valid for EFS
49 * are > 64k, our value cannot be confused for an EFS superblock's.
50 */
51
52typedef struct xfs_agf {
53 /*
54 * Common allocation group header information
55 */
56 __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
57 __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
58 __be32 agf_seqno; /* sequence # starting from 0 */
59 __be32 agf_length; /* size in blocks of a.g. */
60 /*
61 * Freespace information
62 */
63 __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
64 __be32 agf_spare0; /* spare field */
65 __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
66 __be32 agf_spare1; /* spare field */
67
68 __be32 agf_flfirst; /* first freelist block's index */
69 __be32 agf_fllast; /* last freelist block's index */
70 __be32 agf_flcount; /* count of blocks in freelist */
71 __be32 agf_freeblks; /* total free blocks */
72
73 __be32 agf_longest; /* longest free space */
74 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
75 uuid_t agf_uuid; /* uuid of filesystem */
76
77 /*
78 * reserve some contiguous space for future logged fields before we add
79 * the unlogged fields. This makes the range logging via flags and
80 * structure offsets much simpler.
81 */
82 __be64 agf_spare64[16];
83
84 /* unlogged fields, written during buffer writeback. */
85 __be64 agf_lsn; /* last write sequence */
86 __be32 agf_crc; /* crc of agf sector */
87 __be32 agf_spare2;
88
89 /* structure must be padded to 64 bit alignment */
90} xfs_agf_t;
91
92#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
93
/*
 * Bit flags naming the loggable AGF fields, one bit per field in
 * structure order.  XFS_AGF_NUM_BITS is the number of flags defined and
 * XFS_AGF_ALL_BITS is the mask with every one of them set.
 */
#define	XFS_AGF_MAGICNUM	(1 << 0)
#define	XFS_AGF_VERSIONNUM	(1 << 1)
#define	XFS_AGF_SEQNO		(1 << 2)
#define	XFS_AGF_LENGTH		(1 << 3)
#define	XFS_AGF_ROOTS		(1 << 4)
#define	XFS_AGF_LEVELS		(1 << 5)
#define	XFS_AGF_FLFIRST		(1 << 6)
#define	XFS_AGF_FLLAST		(1 << 7)
#define	XFS_AGF_FLCOUNT		(1 << 8)
#define	XFS_AGF_FREEBLKS	(1 << 9)
#define	XFS_AGF_LONGEST		(1 << 10)
#define	XFS_AGF_BTREEBLKS	(1 << 11)
#define	XFS_AGF_UUID		(1 << 12)
#define	XFS_AGF_NUM_BITS	13
#define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)

/* { flag, name } initializer pairs for printing the flags symbolically. */
#define XFS_AGF_FLAGS \
	{ XFS_AGF_MAGICNUM,	"MAGICNUM" }, \
	{ XFS_AGF_VERSIONNUM,	"VERSIONNUM" }, \
	{ XFS_AGF_SEQNO,	"SEQNO" }, \
	{ XFS_AGF_LENGTH,	"LENGTH" }, \
	{ XFS_AGF_ROOTS,	"ROOTS" }, \
	{ XFS_AGF_LEVELS,	"LEVELS" }, \
	{ XFS_AGF_FLFIRST,	"FLFIRST" }, \
	{ XFS_AGF_FLLAST,	"FLLAST" }, \
	{ XFS_AGF_FLCOUNT,	"FLCOUNT" }, \
	{ XFS_AGF_FREEBLKS,	"FREEBLKS" }, \
	{ XFS_AGF_LONGEST,	"LONGEST" }, \
	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \
	{ XFS_AGF_UUID,		"UUID" }
124
125/* disk block (xfs_daddr_t) in the AG */
126#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
127#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
128#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
129
130extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
131 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
132
133/*
134 * Size of the unlinked inode hash table in the agi.
135 */
136#define XFS_AGI_UNLINKED_BUCKETS 64
137
138typedef struct xfs_agi {
139 /*
140 * Common allocation group header information
141 */
142 __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
143 __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
144 __be32 agi_seqno; /* sequence # starting from 0 */
145 __be32 agi_length; /* size in blocks of a.g. */
146 /*
147 * Inode information
148 * Inodes are mapped by interpreting the inode number, so no
149 * mapping data is needed here.
150 */
151 __be32 agi_count; /* count of allocated inodes */
152 __be32 agi_root; /* root of inode btree */
153 __be32 agi_level; /* levels in inode btree */
154 __be32 agi_freecount; /* number of free inodes */
155
156 __be32 agi_newino; /* new inode just allocated */
157 __be32 agi_dirino; /* last directory inode chunk */
158 /*
159 * Hash table of inodes which have been unlinked but are
160 * still being referenced.
161 */
162 __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
163 /*
164 * This marks the end of logging region 1 and start of logging region 2.
165 */
166 uuid_t agi_uuid; /* uuid of filesystem */
167 __be32 agi_crc; /* crc of agi sector */
168 __be32 agi_pad32;
169 __be64 agi_lsn; /* last write sequence */
170
171 __be32 agi_free_root; /* root of the free inode btree */
172 __be32 agi_free_level;/* levels in free inode btree */
173
174 /* structure must be padded to 64 bit alignment */
175} xfs_agi_t;
176
177#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
178
/*
 * Bit flags naming the loggable AGI fields.
 *
 * Bits 0-10 belong to the first logging region; XFS_AGI_NUM_BITS_R1 counts
 * them and XFS_AGI_ALL_BITS_R1 is the mask of all of them.  The free inode
 * btree bits that follow extend the count to XFS_AGI_NUM_BITS_R2.
 */
#define	XFS_AGI_MAGICNUM	(1 << 0)
#define	XFS_AGI_VERSIONNUM	(1 << 1)
#define	XFS_AGI_SEQNO		(1 << 2)
#define	XFS_AGI_LENGTH		(1 << 3)
#define	XFS_AGI_COUNT		(1 << 4)
#define	XFS_AGI_ROOT		(1 << 5)
#define	XFS_AGI_LEVEL		(1 << 6)
#define	XFS_AGI_FREECOUNT	(1 << 7)
#define	XFS_AGI_NEWINO		(1 << 8)
#define	XFS_AGI_DIRINO		(1 << 9)
#define	XFS_AGI_UNLINKED	(1 << 10)
#define	XFS_AGI_NUM_BITS_R1	11	/* end of the 1st agi logging region */
#define	XFS_AGI_ALL_BITS_R1	((1 << XFS_AGI_NUM_BITS_R1) - 1)

#define	XFS_AGI_FREE_ROOT	(1 << 11)
#define	XFS_AGI_FREE_LEVEL	(1 << 12)
#define	XFS_AGI_NUM_BITS_R2	13
195
196/* disk block (xfs_daddr_t) in the AG */
197#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
198#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
199#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
200
201extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
202 xfs_agnumber_t agno, struct xfs_buf **bpp);
203
/*
 * The third a.g. block holds the a.g. freelist (AGFL): an array of block
 * pointers to blocks owned by the allocation btree code.
 */
#define XFS_AGFL_DADDR(mp)	((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
#define XFS_AGFL_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
#define	XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)((bp)->b_addr))

/*
 * Pointer to the first freelist slot in an AGFL buffer.  With CRCs enabled
 * the slots follow the struct xfs_agfl header (agfl_bno[]); otherwise they
 * start at the very beginning of the buffer.
 */
#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
		&(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
		(__be32 *)(bp)->b_addr)

/*
 * Size of the AGFL, in slots.  For CRC-enabled filesystems we steal a
 * couple of slots at the beginning of the block for a proper header
 * carrying the location information and CRC.
 */
#define XFS_AGFL_SIZE(mp) \
	(((mp)->m_sb.sb_sectsize - \
	 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
		sizeof(struct xfs_agfl) : 0)) / \
	  sizeof(xfs_agblock_t))
227
228typedef struct xfs_agfl {
229 __be32 agfl_magicnum;
230 __be32 agfl_seqno;
231 uuid_t agfl_uuid;
232 __be64 agfl_lsn;
233 __be32 agfl_crc;
234 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
235} xfs_agfl_t;
236
237#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
238
/*
 * Tags for the inode radix tree.
 *
 * XFS_ICI_NO_TAG is not a tag that gets set on an inode: it is the special
 * value passed to xfs_inode_ag_iterator to request an untagged lookup.
 */
#define XFS_ICI_NO_TAG		(-1)
#define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
#define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */
246
/* Reads the mount's m_ag_maxlevels value. */
#define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)

/*
 * Minimum freelist length for the given bno (bl) and cnt (cl) btree level
 * counts: each count plus one, capped at XFS_AG_MAXLEVELS(mp), summed.
 *
 * bl and cl are parenthesized in the expansion so that an expression
 * argument (e.g. "a ? b : c") is incremented as a whole rather than being
 * re-associated with the "+ 1".
 */
#define	XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
	(MIN((bl) + 1, XFS_AG_MAXLEVELS(mp)) + \
	 MIN((cl) + 1, XFS_AG_MAXLEVELS(mp)))

/* Minimum freelist length computed from the levels stored in an AGF (a). */
#define	XFS_MIN_FREELIST(a,mp)		\
	(XFS_MIN_FREELIST_RAW(		\
		be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
		be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))

/* Same computation using the levels held in a perag structure (pag). */
#define	XFS_MIN_FREELIST_PAG(pag,mp)	\
	(XFS_MIN_FREELIST_RAW(		\
		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
258
/*
 * Conversions between AG-relative block numbers, filesystem block numbers
 * and disk addresses.
 *
 * A filesystem block number carries the AG number in the bits above
 * sb_agblklog and the AG-relative block number in the bits below it.
 */
#define	XFS_AGB_TO_FSB(mp,agno,agbno)	\
	(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
#define	XFS_FSB_TO_AGNO(mp,fsbno)	\
	((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
#define	XFS_FSB_TO_AGBNO(mp,fsbno)	\
	((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))

/* Disk address of AG-relative block agbno in allocation group agno. */
#define	XFS_AGB_TO_DADDR(mp,agno,agbno)	\
	((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
		(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))

/* Disk address lying d disk blocks past the start of allocation group agno. */
#define	XFS_AG_DADDR(mp,agno,d)		(XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
269
/*
 * Sanity check (via ASSERT) for a range of xfs_daddr_t's.
 *
 * A single-block range (len == 1) must either be XFS_SB_DADDR itself or
 * must not map to XFS_SB_DADDR within its allocation group, i.e. it must
 * not be a superblock copy.  A longer range must start and end in the
 * same allocation group.
 */
#define	XFS_AG_CHECK_DADDR(mp,d,len)	\
	((len) == 1 ? \
	    ASSERT((d) == XFS_SB_DADDR || \
		   xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
	    ASSERT(xfs_daddr_to_agno(mp, d) == \
		   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
280
281#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
new file mode 100644
index 000000000000..4bffffe038a1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -0,0 +1,2630 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_format.h"
21#include "xfs_log_format.h"
22#include "xfs_shared.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_inode.h"
29#include "xfs_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_alloc.h"
32#include "xfs_extent_busy.h"
33#include "xfs_error.h"
34#include "xfs_cksum.h"
35#include "xfs_trace.h"
36#include "xfs_trans.h"
37#include "xfs_buf_item.h"
38#include "xfs_log.h"
39
struct workqueue_struct *xfs_alloc_wq;

/* Absolute difference of a and b; each argument is evaluated twice. */
#define XFS_ABSDIFF(a,b)	(((a) > (b)) ? ((a) - (b)) : ((b) - (a)))

/*
 * Flags for xfs_alloc_fixup_trees(): the named cursor already points at
 * the free extent record, so no lookup is needed for that btree.
 */
#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2
46
47STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
48STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
49STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
50STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
51 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
52
53/*
54 * Lookup the record equal to [bno, len] in the btree given by cur.
55 */
56STATIC int /* error */
57xfs_alloc_lookup_eq(
58 struct xfs_btree_cur *cur, /* btree cursor */
59 xfs_agblock_t bno, /* starting block of extent */
60 xfs_extlen_t len, /* length of extent */
61 int *stat) /* success/failure */
62{
63 cur->bc_rec.a.ar_startblock = bno;
64 cur->bc_rec.a.ar_blockcount = len;
65 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
66}
67
68/*
69 * Lookup the first record greater than or equal to [bno, len]
70 * in the btree given by cur.
71 */
72int /* error */
73xfs_alloc_lookup_ge(
74 struct xfs_btree_cur *cur, /* btree cursor */
75 xfs_agblock_t bno, /* starting block of extent */
76 xfs_extlen_t len, /* length of extent */
77 int *stat) /* success/failure */
78{
79 cur->bc_rec.a.ar_startblock = bno;
80 cur->bc_rec.a.ar_blockcount = len;
81 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
82}
83
84/*
85 * Lookup the first record less than or equal to [bno, len]
86 * in the btree given by cur.
87 */
88int /* error */
89xfs_alloc_lookup_le(
90 struct xfs_btree_cur *cur, /* btree cursor */
91 xfs_agblock_t bno, /* starting block of extent */
92 xfs_extlen_t len, /* length of extent */
93 int *stat) /* success/failure */
94{
95 cur->bc_rec.a.ar_startblock = bno;
96 cur->bc_rec.a.ar_blockcount = len;
97 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
98}
99
100/*
101 * Update the record referred to by cur to the value given
102 * by [bno, len].
103 * This either works (return 0) or gets an EFSCORRUPTED error.
104 */
105STATIC int /* error */
106xfs_alloc_update(
107 struct xfs_btree_cur *cur, /* btree cursor */
108 xfs_agblock_t bno, /* starting block of extent */
109 xfs_extlen_t len) /* length of extent */
110{
111 union xfs_btree_rec rec;
112
113 rec.alloc.ar_startblock = cpu_to_be32(bno);
114 rec.alloc.ar_blockcount = cpu_to_be32(len);
115 return xfs_btree_update(cur, &rec);
116}
117
118/*
119 * Get the data from the pointed-to record.
120 */
121int /* error */
122xfs_alloc_get_rec(
123 struct xfs_btree_cur *cur, /* btree cursor */
124 xfs_agblock_t *bno, /* output: starting block of extent */
125 xfs_extlen_t *len, /* output: length of extent */
126 int *stat) /* output: success/failure */
127{
128 union xfs_btree_rec *rec;
129 int error;
130
131 error = xfs_btree_get_rec(cur, &rec, stat);
132 if (!error && *stat == 1) {
133 *bno = be32_to_cpu(rec->alloc.ar_startblock);
134 *len = be32_to_cpu(rec->alloc.ar_blockcount);
135 }
136 return error;
137}
138
139/*
140 * Compute aligned version of the found extent.
141 * Takes alignment and min length into account.
142 */
143STATIC void
144xfs_alloc_compute_aligned(
145 xfs_alloc_arg_t *args, /* allocation argument structure */
146 xfs_agblock_t foundbno, /* starting block in found extent */
147 xfs_extlen_t foundlen, /* length in found extent */
148 xfs_agblock_t *resbno, /* result block number */
149 xfs_extlen_t *reslen) /* result length */
150{
151 xfs_agblock_t bno;
152 xfs_extlen_t len;
153
154 /* Trim busy sections out of found extent */
155 xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
156
157 if (args->alignment > 1 && len >= args->minlen) {
158 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
159 xfs_extlen_t diff = aligned_bno - bno;
160
161 *resbno = aligned_bno;
162 *reslen = diff >= len ? 0 : len - diff;
163 } else {
164 *resbno = bno;
165 *reslen = len;
166 }
167}
168
169/*
170 * Compute best start block and diff for "near" allocations.
171 * freelen >= wantlen already checked by caller.
172 */
173STATIC xfs_extlen_t /* difference value (absolute) */
174xfs_alloc_compute_diff(
175 xfs_agblock_t wantbno, /* target starting block */
176 xfs_extlen_t wantlen, /* target length */
177 xfs_extlen_t alignment, /* target alignment */
178 char userdata, /* are we allocating data? */
179 xfs_agblock_t freebno, /* freespace's starting block */
180 xfs_extlen_t freelen, /* freespace's length */
181 xfs_agblock_t *newbnop) /* result: best start block from free */
182{
183 xfs_agblock_t freeend; /* end of freespace extent */
184 xfs_agblock_t newbno1; /* return block number */
185 xfs_agblock_t newbno2; /* other new block number */
186 xfs_extlen_t newlen1=0; /* length with newbno1 */
187 xfs_extlen_t newlen2=0; /* length with newbno2 */
188 xfs_agblock_t wantend; /* end of target extent */
189
190 ASSERT(freelen >= wantlen);
191 freeend = freebno + freelen;
192 wantend = wantbno + wantlen;
193 /*
194 * We want to allocate from the start of a free extent if it is past
195 * the desired block or if we are allocating user data and the free
196 * extent is before desired block. The second case is there to allow
197 * for contiguous allocation from the remaining free space if the file
198 * grows in the short term.
199 */
200 if (freebno >= wantbno || (userdata && freeend < wantend)) {
201 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
202 newbno1 = NULLAGBLOCK;
203 } else if (freeend >= wantend && alignment > 1) {
204 newbno1 = roundup(wantbno, alignment);
205 newbno2 = newbno1 - alignment;
206 if (newbno1 >= freeend)
207 newbno1 = NULLAGBLOCK;
208 else
209 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
210 if (newbno2 < freebno)
211 newbno2 = NULLAGBLOCK;
212 else
213 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
214 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
215 if (newlen1 < newlen2 ||
216 (newlen1 == newlen2 &&
217 XFS_ABSDIFF(newbno1, wantbno) >
218 XFS_ABSDIFF(newbno2, wantbno)))
219 newbno1 = newbno2;
220 } else if (newbno2 != NULLAGBLOCK)
221 newbno1 = newbno2;
222 } else if (freeend >= wantend) {
223 newbno1 = wantbno;
224 } else if (alignment > 1) {
225 newbno1 = roundup(freeend - wantlen, alignment);
226 if (newbno1 > freeend - wantlen &&
227 newbno1 - alignment >= freebno)
228 newbno1 -= alignment;
229 else if (newbno1 >= freeend)
230 newbno1 = NULLAGBLOCK;
231 } else
232 newbno1 = freeend - wantlen;
233 *newbnop = newbno1;
234 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
235}
236
237/*
238 * Fix up the length, based on mod and prod.
239 * len should be k * prod + mod for some k.
240 * If len is too small it is returned unchanged.
241 * If len hits maxlen it is left alone.
242 */
243STATIC void
244xfs_alloc_fix_len(
245 xfs_alloc_arg_t *args) /* allocation argument structure */
246{
247 xfs_extlen_t k;
248 xfs_extlen_t rlen;
249
250 ASSERT(args->mod < args->prod);
251 rlen = args->len;
252 ASSERT(rlen >= args->minlen);
253 ASSERT(rlen <= args->maxlen);
254 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
255 (args->mod == 0 && rlen < args->prod))
256 return;
257 k = rlen % args->prod;
258 if (k == args->mod)
259 return;
260 if (k > args->mod)
261 rlen = rlen - (k - args->mod);
262 else
263 rlen = rlen - args->prod + (args->mod - k);
264 if ((int)rlen < (int)args->minlen)
265 return;
266 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
267 ASSERT(rlen % args->prod == args->mod);
268 args->len = rlen;
269}
270
271/*
272 * Fix up length if there is too little space left in the a.g.
273 * Return 1 if ok, 0 if too little, should give up.
274 */
275STATIC int
276xfs_alloc_fix_minleft(
277 xfs_alloc_arg_t *args) /* allocation argument structure */
278{
279 xfs_agf_t *agf; /* a.g. freelist header */
280 int diff; /* free space difference */
281
282 if (args->minleft == 0)
283 return 1;
284 agf = XFS_BUF_TO_AGF(args->agbp);
285 diff = be32_to_cpu(agf->agf_freeblks)
286 - args->len - args->minleft;
287 if (diff >= 0)
288 return 1;
289 args->len += diff; /* shrink the allocated space */
290 if (args->len >= args->minlen)
291 return 1;
292 args->agbno = NULLAGBLOCK;
293 return 0;
294}
295
296/*
297 * Update the two btrees, logically removing from freespace the extent
298 * starting at rbno, rlen blocks. The extent is contained within the
299 * actual (current) free extent fbno for flen blocks.
300 * Flags are passed in indicating whether the cursors are set to the
301 * relevant records.
302 */
303STATIC int /* error code */
304xfs_alloc_fixup_trees(
305 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
306 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
307 xfs_agblock_t fbno, /* starting block of free extent */
308 xfs_extlen_t flen, /* length of free extent */
309 xfs_agblock_t rbno, /* starting block of returned extent */
310 xfs_extlen_t rlen, /* length of returned extent */
311 int flags) /* flags, XFSA_FIXUP_... */
312{
313 int error; /* error code */
314 int i; /* operation results */
315 xfs_agblock_t nfbno1; /* first new free startblock */
316 xfs_agblock_t nfbno2; /* second new free startblock */
317 xfs_extlen_t nflen1=0; /* first new free length */
318 xfs_extlen_t nflen2=0; /* second new free length */
319
320 /*
321 * Look up the record in the by-size tree if necessary.
322 */
323 if (flags & XFSA_FIXUP_CNT_OK) {
324#ifdef DEBUG
325 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
326 return error;
327 XFS_WANT_CORRUPTED_RETURN(
328 i == 1 && nfbno1 == fbno && nflen1 == flen);
329#endif
330 } else {
331 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
332 return error;
333 XFS_WANT_CORRUPTED_RETURN(i == 1);
334 }
335 /*
336 * Look up the record in the by-block tree if necessary.
337 */
338 if (flags & XFSA_FIXUP_BNO_OK) {
339#ifdef DEBUG
340 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
341 return error;
342 XFS_WANT_CORRUPTED_RETURN(
343 i == 1 && nfbno1 == fbno && nflen1 == flen);
344#endif
345 } else {
346 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
347 return error;
348 XFS_WANT_CORRUPTED_RETURN(i == 1);
349 }
350
351#ifdef DEBUG
352 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
353 struct xfs_btree_block *bnoblock;
354 struct xfs_btree_block *cntblock;
355
356 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
357 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
358
359 XFS_WANT_CORRUPTED_RETURN(
360 bnoblock->bb_numrecs == cntblock->bb_numrecs);
361 }
362#endif
363
364 /*
365 * Deal with all four cases: the allocated record is contained
366 * within the freespace record, so we can have new freespace
367 * at either (or both) end, or no freespace remaining.
368 */
369 if (rbno == fbno && rlen == flen)
370 nfbno1 = nfbno2 = NULLAGBLOCK;
371 else if (rbno == fbno) {
372 nfbno1 = rbno + rlen;
373 nflen1 = flen - rlen;
374 nfbno2 = NULLAGBLOCK;
375 } else if (rbno + rlen == fbno + flen) {
376 nfbno1 = fbno;
377 nflen1 = flen - rlen;
378 nfbno2 = NULLAGBLOCK;
379 } else {
380 nfbno1 = fbno;
381 nflen1 = rbno - fbno;
382 nfbno2 = rbno + rlen;
383 nflen2 = (fbno + flen) - nfbno2;
384 }
385 /*
386 * Delete the entry from the by-size btree.
387 */
388 if ((error = xfs_btree_delete(cnt_cur, &i)))
389 return error;
390 XFS_WANT_CORRUPTED_RETURN(i == 1);
391 /*
392 * Add new by-size btree entry(s).
393 */
394 if (nfbno1 != NULLAGBLOCK) {
395 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
396 return error;
397 XFS_WANT_CORRUPTED_RETURN(i == 0);
398 if ((error = xfs_btree_insert(cnt_cur, &i)))
399 return error;
400 XFS_WANT_CORRUPTED_RETURN(i == 1);
401 }
402 if (nfbno2 != NULLAGBLOCK) {
403 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
404 return error;
405 XFS_WANT_CORRUPTED_RETURN(i == 0);
406 if ((error = xfs_btree_insert(cnt_cur, &i)))
407 return error;
408 XFS_WANT_CORRUPTED_RETURN(i == 1);
409 }
410 /*
411 * Fix up the by-block btree entry(s).
412 */
413 if (nfbno1 == NULLAGBLOCK) {
414 /*
415 * No remaining freespace, just delete the by-block tree entry.
416 */
417 if ((error = xfs_btree_delete(bno_cur, &i)))
418 return error;
419 XFS_WANT_CORRUPTED_RETURN(i == 1);
420 } else {
421 /*
422 * Update the by-block entry to start later|be shorter.
423 */
424 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
425 return error;
426 }
427 if (nfbno2 != NULLAGBLOCK) {
428 /*
429 * 2 resulting free entries, need to add one.
430 */
431 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
432 return error;
433 XFS_WANT_CORRUPTED_RETURN(i == 0);
434 if ((error = xfs_btree_insert(bno_cur, &i)))
435 return error;
436 XFS_WANT_CORRUPTED_RETURN(i == 1);
437 }
438 return 0;
439}
440
441static bool
442xfs_agfl_verify(
443 struct xfs_buf *bp)
444{
445 struct xfs_mount *mp = bp->b_target->bt_mount;
446 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
447 int i;
448
449 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
450 return false;
451 if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
452 return false;
453 /*
454 * during growfs operations, the perag is not fully initialised,
455 * so we can't use it for any useful checking. growfs ensures we can't
456 * use it by using uncached buffers that don't have the perag attached
457 * so we can detect and avoid this problem.
458 */
459 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
460 return false;
461
462 for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
463 if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
464 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
465 return false;
466 }
467 return true;
468}
469
470static void
471xfs_agfl_read_verify(
472 struct xfs_buf *bp)
473{
474 struct xfs_mount *mp = bp->b_target->bt_mount;
475
476 /*
477 * There is no verification of non-crc AGFLs because mkfs does not
478 * initialise the AGFL to zero or NULL. Hence the only valid part of the
479 * AGFL is what the AGF says is active. We can't get to the AGF, so we
480 * can't verify just those entries are valid.
481 */
482 if (!xfs_sb_version_hascrc(&mp->m_sb))
483 return;
484
485 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
486 xfs_buf_ioerror(bp, -EFSBADCRC);
487 else if (!xfs_agfl_verify(bp))
488 xfs_buf_ioerror(bp, -EFSCORRUPTED);
489
490 if (bp->b_error)
491 xfs_verifier_error(bp);
492}
493
494static void
495xfs_agfl_write_verify(
496 struct xfs_buf *bp)
497{
498 struct xfs_mount *mp = bp->b_target->bt_mount;
499 struct xfs_buf_log_item *bip = bp->b_fspriv;
500
501 /* no verification of non-crc AGFLs */
502 if (!xfs_sb_version_hascrc(&mp->m_sb))
503 return;
504
505 if (!xfs_agfl_verify(bp)) {
506 xfs_buf_ioerror(bp, -EFSCORRUPTED);
507 xfs_verifier_error(bp);
508 return;
509 }
510
511 if (bip)
512 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
513
514 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
515}
516
517const struct xfs_buf_ops xfs_agfl_buf_ops = {
518 .verify_read = xfs_agfl_read_verify,
519 .verify_write = xfs_agfl_write_verify,
520};
521
522/*
523 * Read in the allocation group free block array.
524 */
525STATIC int /* error */
526xfs_alloc_read_agfl(
527 xfs_mount_t *mp, /* mount point structure */
528 xfs_trans_t *tp, /* transaction pointer */
529 xfs_agnumber_t agno, /* allocation group number */
530 xfs_buf_t **bpp) /* buffer for the ag free block array */
531{
532 xfs_buf_t *bp; /* return value */
533 int error;
534
535 ASSERT(agno != NULLAGNUMBER);
536 error = xfs_trans_read_buf(
537 mp, tp, mp->m_ddev_targp,
538 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
539 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
540 if (error)
541 return error;
542 xfs_buf_set_ref(bp, XFS_AGFL_REF);
543 *bpp = bp;
544 return 0;
545}
546
547STATIC int
548xfs_alloc_update_counters(
549 struct xfs_trans *tp,
550 struct xfs_perag *pag,
551 struct xfs_buf *agbp,
552 long len)
553{
554 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
555
556 pag->pagf_freeblks += len;
557 be32_add_cpu(&agf->agf_freeblks, len);
558
559 xfs_trans_agblocks_delta(tp, len);
560 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
561 be32_to_cpu(agf->agf_length)))
562 return -EFSCORRUPTED;
563
564 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
565 return 0;
566}
567
568/*
569 * Allocation group level functions.
570 */
571
572/*
573 * Allocate a variable extent in the allocation group agno.
574 * Type and bno are used to determine where in the allocation group the
575 * extent will start.
576 * Extent's length (returned in *len) will be between minlen and maxlen,
577 * and of the form k * prod + mod unless there's nothing that large.
578 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
579 */
580STATIC int /* error */
581xfs_alloc_ag_vextent(
582 xfs_alloc_arg_t *args) /* argument structure for allocation */
583{
584 int error=0;
585
586 ASSERT(args->minlen > 0);
587 ASSERT(args->maxlen > 0);
588 ASSERT(args->minlen <= args->maxlen);
589 ASSERT(args->mod < args->prod);
590 ASSERT(args->alignment > 0);
591 /*
592 * Branch to correct routine based on the type.
593 */
594 args->wasfromfl = 0;
595 switch (args->type) {
596 case XFS_ALLOCTYPE_THIS_AG:
597 error = xfs_alloc_ag_vextent_size(args);
598 break;
599 case XFS_ALLOCTYPE_NEAR_BNO:
600 error = xfs_alloc_ag_vextent_near(args);
601 break;
602 case XFS_ALLOCTYPE_THIS_BNO:
603 error = xfs_alloc_ag_vextent_exact(args);
604 break;
605 default:
606 ASSERT(0);
607 /* NOTREACHED */
608 }
609
610 if (error || args->agbno == NULLAGBLOCK)
611 return error;
612
613 ASSERT(args->len >= args->minlen);
614 ASSERT(args->len <= args->maxlen);
615 ASSERT(!args->wasfromfl || !args->isfl);
616 ASSERT(args->agbno % args->alignment == 0);
617
618 if (!args->wasfromfl) {
619 error = xfs_alloc_update_counters(args->tp, args->pag,
620 args->agbp,
621 -((long)(args->len)));
622 if (error)
623 return error;
624
625 ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
626 args->agbno, args->len));
627 }
628
629 if (!args->isfl) {
630 xfs_trans_mod_sb(args->tp, args->wasdel ?
631 XFS_TRANS_SB_RES_FDBLOCKS :
632 XFS_TRANS_SB_FDBLOCKS,
633 -((long)(args->len)));
634 }
635
636 XFS_STATS_INC(xs_allocx);
637 XFS_STATS_ADD(xs_allocb, args->len);
638 return error;
639}
640
641/*
642 * Allocate a variable extent at exactly agno/bno.
643 * Extent's length (returned in *len) will be between minlen and maxlen,
644 * and of the form k * prod + mod unless there's nothing that large.
645 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
646 */
647STATIC int /* error */
648xfs_alloc_ag_vextent_exact(
649 xfs_alloc_arg_t *args) /* allocation argument structure */
650{
651 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
652 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
653 int error;
654 xfs_agblock_t fbno; /* start block of found extent */
655 xfs_extlen_t flen; /* length of found extent */
656 xfs_agblock_t tbno; /* start block of trimmed extent */
657 xfs_extlen_t tlen; /* length of trimmed extent */
658 xfs_agblock_t tend; /* end block of trimmed extent */
659 int i; /* success/failure of operation */
660
661 ASSERT(args->alignment == 1);
662
663 /*
664 * Allocate/initialize a cursor for the by-number freespace btree.
665 */
666 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
667 args->agno, XFS_BTNUM_BNO);
668
669 /*
670 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
671 * Look for the closest free block <= bno, it must contain bno
672 * if any free block does.
673 */
674 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
675 if (error)
676 goto error0;
677 if (!i)
678 goto not_found;
679
680 /*
681 * Grab the freespace record.
682 */
683 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
684 if (error)
685 goto error0;
686 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
687 ASSERT(fbno <= args->agbno);
688
689 /*
690 * Check for overlapping busy extents.
691 */
692 xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
693
694 /*
695 * Give up if the start of the extent is busy, or the freespace isn't
696 * long enough for the minimum request.
697 */
698 if (tbno > args->agbno)
699 goto not_found;
700 if (tlen < args->minlen)
701 goto not_found;
702 tend = tbno + tlen;
703 if (tend < args->agbno + args->minlen)
704 goto not_found;
705
706 /*
707 * End of extent will be smaller of the freespace end and the
708 * maximal requested end.
709 *
710 * Fix the length according to mod and prod if given.
711 */
712 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
713 - args->agbno;
714 xfs_alloc_fix_len(args);
715 if (!xfs_alloc_fix_minleft(args))
716 goto not_found;
717
718 ASSERT(args->agbno + args->len <= tend);
719
720 /*
721 * We are allocating agbno for args->len
722 * Allocate/initialize a cursor for the by-size btree.
723 */
724 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
725 args->agno, XFS_BTNUM_CNT);
726 ASSERT(args->agbno + args->len <=
727 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
728 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
729 args->len, XFSA_FIXUP_BNO_OK);
730 if (error) {
731 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
732 goto error0;
733 }
734
735 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
736 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
737
738 args->wasfromfl = 0;
739 trace_xfs_alloc_exact_done(args);
740 return 0;
741
742not_found:
743 /* Didn't find it, return null. */
744 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
745 args->agbno = NULLAGBLOCK;
746 trace_xfs_alloc_exact_notfound(args);
747 return 0;
748
749error0:
750 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
751 trace_xfs_alloc_exact_error(args);
752 return error;
753}
754
755/*
756 * Search the btree in a given direction via the search cursor and compare
757 * the records found against the good extent we've already found.
758 */
759STATIC int
760xfs_alloc_find_best_extent(
761 struct xfs_alloc_arg *args, /* allocation argument structure */
762 struct xfs_btree_cur **gcur, /* good cursor */
763 struct xfs_btree_cur **scur, /* searching cursor */
764 xfs_agblock_t gdiff, /* difference for search comparison */
765 xfs_agblock_t *sbno, /* extent found by search */
766 xfs_extlen_t *slen, /* extent length */
767 xfs_agblock_t *sbnoa, /* aligned extent found by search */
768 xfs_extlen_t *slena, /* aligned extent length */
769 int dir) /* 0 = search right, 1 = search left */
770{
771 xfs_agblock_t new;
772 xfs_agblock_t sdiff;
773 int error;
774 int i;
775
776 /* The good extent is perfect, no need to search. */
777 if (!gdiff)
778 goto out_use_good;
779
780 /*
781 * Look until we find a better one, run out of space or run off the end.
782 */
783 do {
784 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
785 if (error)
786 goto error0;
787 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
788 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
789
790 /*
791 * The good extent is closer than this one.
792 */
793 if (!dir) {
794 if (*sbnoa >= args->agbno + gdiff)
795 goto out_use_good;
796 } else {
797 if (*sbnoa <= args->agbno - gdiff)
798 goto out_use_good;
799 }
800
801 /*
802 * Same distance, compare length and pick the best.
803 */
804 if (*slena >= args->minlen) {
805 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
806 xfs_alloc_fix_len(args);
807
808 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
809 args->alignment,
810 args->userdata, *sbnoa,
811 *slena, &new);
812
813 /*
814 * Choose closer size and invalidate other cursor.
815 */
816 if (sdiff < gdiff)
817 goto out_use_search;
818 goto out_use_good;
819 }
820
821 if (!dir)
822 error = xfs_btree_increment(*scur, 0, &i);
823 else
824 error = xfs_btree_decrement(*scur, 0, &i);
825 if (error)
826 goto error0;
827 } while (i);
828
829out_use_good:
830 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
831 *scur = NULL;
832 return 0;
833
834out_use_search:
835 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
836 *gcur = NULL;
837 return 0;
838
839error0:
840 /* caller invalidates cursors */
841 return error;
842}
843
844/*
845 * Allocate a variable extent near bno in the allocation group agno.
846 * Extent's length (returned in len) will be between minlen and maxlen,
847 * and of the form k * prod + mod unless there's nothing that large.
848 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
849 */
850STATIC int /* error */
851xfs_alloc_ag_vextent_near(
852 xfs_alloc_arg_t *args) /* allocation argument structure */
853{
854 xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
855 xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
856 xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
857 xfs_agblock_t gtbno; /* start bno of right side entry */
858 xfs_agblock_t gtbnoa; /* aligned ... */
859 xfs_extlen_t gtdiff; /* difference to right side entry */
860 xfs_extlen_t gtlen; /* length of right side entry */
861 xfs_extlen_t gtlena; /* aligned ... */
862 xfs_agblock_t gtnew; /* useful start bno of right side */
863 int error; /* error code */
864 int i; /* result code, temporary */
865 int j; /* result code, temporary */
866 xfs_agblock_t ltbno; /* start bno of left side entry */
867 xfs_agblock_t ltbnoa; /* aligned ... */
868 xfs_extlen_t ltdiff; /* difference to left side entry */
869 xfs_extlen_t ltlen; /* length of left side entry */
870 xfs_extlen_t ltlena; /* aligned ... */
871 xfs_agblock_t ltnew; /* useful start bno of left side */
872 xfs_extlen_t rlen; /* length of returned extent */
873 int forced = 0;
874#ifdef DEBUG
875 /*
876 * Randomly don't execute the first algorithm.
877 */
878 int dofirst; /* set to do first algorithm */
879
880 dofirst = prandom_u32() & 1;
881#endif
882
883restart:
884 bno_cur_lt = NULL;
885 bno_cur_gt = NULL;
886 ltlen = 0;
887 gtlena = 0;
888 ltlena = 0;
889
890 /*
891 * Get a cursor for the by-size btree.
892 */
893 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
894 args->agno, XFS_BTNUM_CNT);
895
896 /*
897 * See if there are any free extents as big as maxlen.
898 */
899 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
900 goto error0;
901 /*
902 * If none, then pick up the last entry in the tree unless the
903 * tree is empty.
904 */
905 if (!i) {
906 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
907 &ltlen, &i)))
908 goto error0;
909 if (i == 0 || ltlen == 0) {
910 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
911 trace_xfs_alloc_near_noentry(args);
912 return 0;
913 }
914 ASSERT(i == 1);
915 }
916 args->wasfromfl = 0;
917
918 /*
919 * First algorithm.
920 * If the requested extent is large wrt the freespaces available
921 * in this a.g., then the cursor will be pointing to a btree entry
922 * near the right edge of the tree. If it's in the last btree leaf
923 * block, then we just examine all the entries in that block
924 * that are big enough, and pick the best one.
925 * This is written as a while loop so we can break out of it,
926 * but we never loop back to the top.
927 */
928 while (xfs_btree_islastblock(cnt_cur, 0)) {
929 xfs_extlen_t bdiff;
930 int besti=0;
931 xfs_extlen_t blen=0;
932 xfs_agblock_t bnew=0;
933
934#ifdef DEBUG
935 if (dofirst)
936 break;
937#endif
938 /*
939 * Start from the entry that lookup found, sequence through
940 * all larger free blocks. If we're actually pointing at a
941 * record smaller than maxlen, go to the start of this block,
942 * and skip all those smaller than minlen.
943 */
944 if (ltlen || args->alignment > 1) {
945 cnt_cur->bc_ptrs[0] = 1;
946 do {
947 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
948 &ltlen, &i)))
949 goto error0;
950 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
951 if (ltlen >= args->minlen)
952 break;
953 if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
954 goto error0;
955 } while (i);
956 ASSERT(ltlen >= args->minlen);
957 if (!i)
958 break;
959 }
960 i = cnt_cur->bc_ptrs[0];
961 for (j = 1, blen = 0, bdiff = 0;
962 !error && j && (blen < args->maxlen || bdiff > 0);
963 error = xfs_btree_increment(cnt_cur, 0, &j)) {
964 /*
965 * For each entry, decide if it's better than
966 * the previous best entry.
967 */
968 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
969 goto error0;
970 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
971 xfs_alloc_compute_aligned(args, ltbno, ltlen,
972 &ltbnoa, &ltlena);
973 if (ltlena < args->minlen)
974 continue;
975 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
976 xfs_alloc_fix_len(args);
977 ASSERT(args->len >= args->minlen);
978 if (args->len < blen)
979 continue;
980 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
981 args->alignment, args->userdata, ltbnoa,
982 ltlena, &ltnew);
983 if (ltnew != NULLAGBLOCK &&
984 (args->len > blen || ltdiff < bdiff)) {
985 bdiff = ltdiff;
986 bnew = ltnew;
987 blen = args->len;
988 besti = cnt_cur->bc_ptrs[0];
989 }
990 }
991 /*
992 * It didn't work. We COULD be in a case where
993 * there's a good record somewhere, so try again.
994 */
995 if (blen == 0)
996 break;
997 /*
998 * Point at the best entry, and retrieve it again.
999 */
1000 cnt_cur->bc_ptrs[0] = besti;
1001 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
1002 goto error0;
1003 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1004 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1005 args->len = blen;
1006 if (!xfs_alloc_fix_minleft(args)) {
1007 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1008 trace_xfs_alloc_near_nominleft(args);
1009 return 0;
1010 }
1011 blen = args->len;
1012 /*
1013 * We are allocating starting at bnew for blen blocks.
1014 */
1015 args->agbno = bnew;
1016 ASSERT(bnew >= ltbno);
1017 ASSERT(bnew + blen <= ltbno + ltlen);
1018 /*
1019 * Set up a cursor for the by-bno tree.
1020 */
1021 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
1022 args->agbp, args->agno, XFS_BTNUM_BNO);
1023 /*
1024 * Fix up the btree entries.
1025 */
1026 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
1027 ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
1028 goto error0;
1029 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1030 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
1031
1032 trace_xfs_alloc_near_first(args);
1033 return 0;
1034 }
1035 /*
1036 * Second algorithm.
1037 * Search in the by-bno tree to the left and to the right
1038 * simultaneously, until in each case we find a space big enough,
1039 * or run into the edge of the tree. When we run into the edge,
1040 * we deallocate that cursor.
1041 * If both searches succeed, we compare the two spaces and pick
1042 * the better one.
1043 * With alignment, it's possible for both to fail; the upper
1044 * level algorithm that picks allocation groups for allocations
1045 * is not supposed to do this.
1046 */
1047 /*
1048 * Allocate and initialize the cursor for the leftward search.
1049 */
1050 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1051 args->agno, XFS_BTNUM_BNO);
1052 /*
1053 * Lookup <= bno to find the leftward search's starting point.
1054 */
1055 if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
1056 goto error0;
1057 if (!i) {
1058 /*
1059 * Didn't find anything; use this cursor for the rightward
1060 * search.
1061 */
1062 bno_cur_gt = bno_cur_lt;
1063 bno_cur_lt = NULL;
1064 }
1065 /*
1066 * Found something. Duplicate the cursor for the rightward search.
1067 */
1068 else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
1069 goto error0;
1070 /*
1071 * Increment the cursor, so we will point at the entry just right
1072 * of the leftward entry if any, or to the leftmost entry.
1073 */
1074 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
1075 goto error0;
1076 if (!i) {
1077 /*
1078 * It failed, there are no rightward entries.
1079 */
1080 xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
1081 bno_cur_gt = NULL;
1082 }
1083 /*
1084 * Loop going left with the leftward cursor, right with the
1085 * rightward cursor, until either both directions give up or
1086 * we find an entry at least as big as minlen.
1087 */
1088 do {
1089 if (bno_cur_lt) {
1090 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
1091 goto error0;
1092 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1093 xfs_alloc_compute_aligned(args, ltbno, ltlen,
1094 &ltbnoa, &ltlena);
1095 if (ltlena >= args->minlen)
1096 break;
1097 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
1098 goto error0;
1099 if (!i) {
1100 xfs_btree_del_cursor(bno_cur_lt,
1101 XFS_BTREE_NOERROR);
1102 bno_cur_lt = NULL;
1103 }
1104 }
1105 if (bno_cur_gt) {
1106 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1107 goto error0;
1108 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1109 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1110 &gtbnoa, &gtlena);
1111 if (gtlena >= args->minlen)
1112 break;
1113 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
1114 goto error0;
1115 if (!i) {
1116 xfs_btree_del_cursor(bno_cur_gt,
1117 XFS_BTREE_NOERROR);
1118 bno_cur_gt = NULL;
1119 }
1120 }
1121 } while (bno_cur_lt || bno_cur_gt);
1122
1123 /*
1124 * Got both cursors still active, need to find better entry.
1125 */
1126 if (bno_cur_lt && bno_cur_gt) {
1127 if (ltlena >= args->minlen) {
1128 /*
1129 * Left side is good, look for a right side entry.
1130 */
1131 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1132 xfs_alloc_fix_len(args);
1133 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1134 args->alignment, args->userdata, ltbnoa,
1135 ltlena, &ltnew);
1136
1137 error = xfs_alloc_find_best_extent(args,
1138 &bno_cur_lt, &bno_cur_gt,
1139 ltdiff, &gtbno, &gtlen,
1140 &gtbnoa, &gtlena,
1141 0 /* search right */);
1142 } else {
1143 ASSERT(gtlena >= args->minlen);
1144
1145 /*
1146 * Right side is good, look for a left side entry.
1147 */
1148 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1149 xfs_alloc_fix_len(args);
1150 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1151 args->alignment, args->userdata, gtbnoa,
1152 gtlena, &gtnew);
1153
1154 error = xfs_alloc_find_best_extent(args,
1155 &bno_cur_gt, &bno_cur_lt,
1156 gtdiff, &ltbno, &ltlen,
1157 &ltbnoa, &ltlena,
1158 1 /* search left */);
1159 }
1160
1161 if (error)
1162 goto error0;
1163 }
1164
1165 /*
1166 * If we couldn't get anything, give up.
1167 */
1168 if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
1169 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1170
1171 if (!forced++) {
1172 trace_xfs_alloc_near_busy(args);
1173 xfs_log_force(args->mp, XFS_LOG_SYNC);
1174 goto restart;
1175 }
1176 trace_xfs_alloc_size_neither(args);
1177 args->agbno = NULLAGBLOCK;
1178 return 0;
1179 }
1180
1181 /*
1182 * At this point we have selected a freespace entry, either to the
1183 * left or to the right. If it's on the right, copy all the
1184 * useful variables to the "left" set so we only have one
1185 * copy of this code.
1186 */
1187 if (bno_cur_gt) {
1188 bno_cur_lt = bno_cur_gt;
1189 bno_cur_gt = NULL;
1190 ltbno = gtbno;
1191 ltbnoa = gtbnoa;
1192 ltlen = gtlen;
1193 ltlena = gtlena;
1194 j = 1;
1195 } else
1196 j = 0;
1197
1198 /*
1199 * Fix up the length and compute the useful address.
1200 */
1201 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1202 xfs_alloc_fix_len(args);
1203 if (!xfs_alloc_fix_minleft(args)) {
1204 trace_xfs_alloc_near_nominleft(args);
1205 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
1206 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1207 return 0;
1208 }
1209 rlen = args->len;
1210 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1211 args->userdata, ltbnoa, ltlena, &ltnew);
1212 ASSERT(ltnew >= ltbno);
1213 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1214 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1215 args->agbno = ltnew;
1216
1217 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
1218 ltnew, rlen, XFSA_FIXUP_BNO_OK)))
1219 goto error0;
1220
1221 if (j)
1222 trace_xfs_alloc_near_greater(args);
1223 else
1224 trace_xfs_alloc_near_lesser(args);
1225
1226 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1227 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
1228 return 0;
1229
1230 error0:
1231 trace_xfs_alloc_near_error(args);
1232 if (cnt_cur != NULL)
1233 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1234 if (bno_cur_lt != NULL)
1235 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
1236 if (bno_cur_gt != NULL)
1237 xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
1238 return error;
1239}
1240
1241/*
1242 * Allocate a variable extent anywhere in the allocation group agno.
1243 * Extent's length (returned in len) will be between minlen and maxlen,
1244 * and of the form k * prod + mod unless there's nothing that large.
1245 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1246 */
1247STATIC int /* error */
1248xfs_alloc_ag_vextent_size(
1249 xfs_alloc_arg_t *args) /* allocation argument structure */
1250{
1251 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
1252 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
1253 int error; /* error result */
1254 xfs_agblock_t fbno; /* start of found freespace */
1255 xfs_extlen_t flen; /* length of found freespace */
1256 int i; /* temp status variable */
1257 xfs_agblock_t rbno; /* returned block number */
1258 xfs_extlen_t rlen; /* length of returned extent */
1259 int forced = 0;
1260
1261restart:
1262 /*
1263 * Allocate and initialize a cursor for the by-size btree.
1264 */
1265 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1266 args->agno, XFS_BTNUM_CNT);
1267 bno_cur = NULL;
1268
1269 /*
1270 * Look for an entry >= maxlen+alignment-1 blocks.
1271 */
1272 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1273 args->maxlen + args->alignment - 1, &i)))
1274 goto error0;
1275
1276 /*
1277 * If none or we have busy extents that we cannot allocate from, then
1278 * we have to settle for a smaller extent. In the case that there are
1279 * no large extents, this will return the last entry in the tree unless
1280 * the tree is empty. In the case that there are only busy large
1281 * extents, this will return the largest small extent unless there
1282 * are no smaller extents available.
1283 */
1284 if (!i || forced > 1) {
1285 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1286 &fbno, &flen, &i);
1287 if (error)
1288 goto error0;
1289 if (i == 0 || flen == 0) {
1290 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1291 trace_xfs_alloc_size_noentry(args);
1292 return 0;
1293 }
1294 ASSERT(i == 1);
1295 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1296 } else {
1297 /*
1298 * Search for a non-busy extent that is large enough.
1299 * If we are at low space, don't check, or if we fall of
1300 * the end of the btree, turn off the busy check and
1301 * restart.
1302 */
1303 for (;;) {
1304 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1305 if (error)
1306 goto error0;
1307 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1308
1309 xfs_alloc_compute_aligned(args, fbno, flen,
1310 &rbno, &rlen);
1311
1312 if (rlen >= args->maxlen)
1313 break;
1314
1315 error = xfs_btree_increment(cnt_cur, 0, &i);
1316 if (error)
1317 goto error0;
1318 if (i == 0) {
1319 /*
1320 * Our only valid extents must have been busy.
1321 * Make it unbusy by forcing the log out and
1322 * retrying. If we've been here before, forcing
1323 * the log isn't making the extents available,
1324 * which means they have probably been freed in
1325 * this transaction. In that case, we have to
1326 * give up on them and we'll attempt a minlen
1327 * allocation the next time around.
1328 */
1329 xfs_btree_del_cursor(cnt_cur,
1330 XFS_BTREE_NOERROR);
1331 trace_xfs_alloc_size_busy(args);
1332 if (!forced++)
1333 xfs_log_force(args->mp, XFS_LOG_SYNC);
1334 goto restart;
1335 }
1336 }
1337 }
1338
1339 /*
1340 * In the first case above, we got the last entry in the
1341 * by-size btree. Now we check to see if the space hits maxlen
1342 * once aligned; if not, we search left for something better.
1343 * This can't happen in the second case above.
1344 */
1345 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1346 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1347 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
1348 if (rlen < args->maxlen) {
1349 xfs_agblock_t bestfbno;
1350 xfs_extlen_t bestflen;
1351 xfs_agblock_t bestrbno;
1352 xfs_extlen_t bestrlen;
1353
1354 bestrlen = rlen;
1355 bestrbno = rbno;
1356 bestflen = flen;
1357 bestfbno = fbno;
1358 for (;;) {
1359 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1360 goto error0;
1361 if (i == 0)
1362 break;
1363 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1364 &i)))
1365 goto error0;
1366 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1367 if (flen < bestrlen)
1368 break;
1369 xfs_alloc_compute_aligned(args, fbno, flen,
1370 &rbno, &rlen);
1371 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1372 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1373 (rlen <= flen && rbno + rlen <= fbno + flen),
1374 error0);
1375 if (rlen > bestrlen) {
1376 bestrlen = rlen;
1377 bestrbno = rbno;
1378 bestflen = flen;
1379 bestfbno = fbno;
1380 if (rlen == args->maxlen)
1381 break;
1382 }
1383 }
1384 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1385 &i)))
1386 goto error0;
1387 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1388 rlen = bestrlen;
1389 rbno = bestrbno;
1390 flen = bestflen;
1391 fbno = bestfbno;
1392 }
1393 args->wasfromfl = 0;
1394 /*
1395 * Fix up the length.
1396 */
1397 args->len = rlen;
1398 if (rlen < args->minlen) {
1399 if (!forced++) {
1400 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1401 trace_xfs_alloc_size_busy(args);
1402 xfs_log_force(args->mp, XFS_LOG_SYNC);
1403 goto restart;
1404 }
1405 goto out_nominleft;
1406 }
1407 xfs_alloc_fix_len(args);
1408
1409 if (!xfs_alloc_fix_minleft(args))
1410 goto out_nominleft;
1411 rlen = args->len;
1412 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
1413 /*
1414 * Allocate and initialize a cursor for the by-block tree.
1415 */
1416 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1417 args->agno, XFS_BTNUM_BNO);
1418 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1419 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1420 goto error0;
1421 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1422 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1423 cnt_cur = bno_cur = NULL;
1424 args->len = rlen;
1425 args->agbno = rbno;
1426 XFS_WANT_CORRUPTED_GOTO(
1427 args->agbno + args->len <=
1428 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
1429 error0);
1430 trace_xfs_alloc_size_done(args);
1431 return 0;
1432
1433error0:
1434 trace_xfs_alloc_size_error(args);
1435 if (cnt_cur)
1436 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1437 if (bno_cur)
1438 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1439 return error;
1440
1441out_nominleft:
1442 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1443 trace_xfs_alloc_size_nominleft(args);
1444 args->agbno = NULLAGBLOCK;
1445 return 0;
1446}
1447
1448/*
1449 * Deal with the case where only small freespaces remain.
1450 * Either return the contents of the last freespace record,
1451 * or allocate space from the freelist if there is nothing in the tree.
1452 */
1453STATIC int /* error */
1454xfs_alloc_ag_vextent_small(
1455 xfs_alloc_arg_t *args, /* allocation argument structure */
1456 xfs_btree_cur_t *ccur, /* by-size cursor */
1457 xfs_agblock_t *fbnop, /* result block number */
1458 xfs_extlen_t *flenp, /* result length */
1459 int *stat) /* status: 0-freelist, 1-normal/none */
1460{
1461 int error;
1462 xfs_agblock_t fbno;
1463 xfs_extlen_t flen;
1464 int i;
1465
1466 if ((error = xfs_btree_decrement(ccur, 0, &i)))
1467 goto error0;
1468 if (i) {
1469 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
1470 goto error0;
1471 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1472 }
1473 /*
1474 * Nothing in the btree, try the freelist. Make sure
1475 * to respect minleft even when pulling from the
1476 * freelist.
1477 */
1478 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
1479 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
1480 > args->minleft)) {
1481 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
1482 if (error)
1483 goto error0;
1484 if (fbno != NULLAGBLOCK) {
1485 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
1486 args->userdata);
1487
1488 if (args->userdata) {
1489 xfs_buf_t *bp;
1490
1491 bp = xfs_btree_get_bufs(args->mp, args->tp,
1492 args->agno, fbno, 0);
1493 xfs_trans_binval(args->tp, bp);
1494 }
1495 args->len = 1;
1496 args->agbno = fbno;
1497 XFS_WANT_CORRUPTED_GOTO(
1498 args->agbno + args->len <=
1499 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
1500 error0);
1501 args->wasfromfl = 1;
1502 trace_xfs_alloc_small_freelist(args);
1503 *stat = 0;
1504 return 0;
1505 }
1506 /*
1507 * Nothing in the freelist.
1508 */
1509 else
1510 flen = 0;
1511 }
1512 /*
1513 * Can't allocate from the freelist for some reason.
1514 */
1515 else {
1516 fbno = NULLAGBLOCK;
1517 flen = 0;
1518 }
1519 /*
1520 * Can't do the allocation, give up.
1521 */
1522 if (flen < args->minlen) {
1523 args->agbno = NULLAGBLOCK;
1524 trace_xfs_alloc_small_notenough(args);
1525 flen = 0;
1526 }
1527 *fbnop = fbno;
1528 *flenp = flen;
1529 *stat = 1;
1530 trace_xfs_alloc_small_done(args);
1531 return 0;
1532
1533error0:
1534 trace_xfs_alloc_small_error(args);
1535 return error;
1536}
1537
1538/*
1539 * Free the extent starting at agno/bno for length.
1540 */
1541STATIC int /* error */
1542xfs_free_ag_extent(
1543 xfs_trans_t *tp, /* transaction pointer */
1544 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
1545 xfs_agnumber_t agno, /* allocation group number */
1546 xfs_agblock_t bno, /* starting block number */
1547 xfs_extlen_t len, /* length of extent */
1548 int isfl) /* set if is freelist blocks - no sb acctg */
1549{
1550 xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
1551 xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
1552 int error; /* error return value */
1553 xfs_agblock_t gtbno; /* start of right neighbor block */
1554 xfs_extlen_t gtlen; /* length of right neighbor block */
1555 int haveleft; /* have a left neighbor block */
1556 int haveright; /* have a right neighbor block */
1557 int i; /* temp, result code */
1558 xfs_agblock_t ltbno; /* start of left neighbor block */
1559 xfs_extlen_t ltlen; /* length of left neighbor block */
1560 xfs_mount_t *mp; /* mount point struct for filesystem */
1561 xfs_agblock_t nbno; /* new starting block of freespace */
1562 xfs_extlen_t nlen; /* new length of freespace */
1563 xfs_perag_t *pag; /* per allocation group data */
1564
1565 mp = tp->t_mountp;
1566 /*
1567 * Allocate and initialize a cursor for the by-block btree.
1568 */
1569 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1570 cnt_cur = NULL;
1571 /*
1572 * Look for a neighboring block on the left (lower block numbers)
1573 * that is contiguous with this space.
1574 */
1575 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1576 goto error0;
1577 if (haveleft) {
1578 /*
1579 * There is a block to our left.
1580 */
1581 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1582 goto error0;
1583 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1584 /*
1585 * It's not contiguous, though.
1586 */
1587 if (ltbno + ltlen < bno)
1588 haveleft = 0;
1589 else {
1590 /*
1591 * If this failure happens the request to free this
1592 * space was invalid, it's (partly) already free.
1593 * Very bad.
1594 */
1595 XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
1596 }
1597 }
1598 /*
1599 * Look for a neighboring block on the right (higher block numbers)
1600 * that is contiguous with this space.
1601 */
1602 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1603 goto error0;
1604 if (haveright) {
1605 /*
1606 * There is a block to our right.
1607 */
1608 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1609 goto error0;
1610 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1611 /*
1612 * It's not contiguous, though.
1613 */
1614 if (bno + len < gtbno)
1615 haveright = 0;
1616 else {
1617 /*
1618 * If this failure happens the request to free this
1619 * space was invalid, it's (partly) already free.
1620 * Very bad.
1621 */
1622 XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
1623 }
1624 }
1625 /*
1626 * Now allocate and initialize a cursor for the by-size tree.
1627 */
1628 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1629 /*
1630 * Have both left and right contiguous neighbors.
1631 * Merge all three into a single free block.
1632 */
1633 if (haveleft && haveright) {
1634 /*
1635 * Delete the old by-size entry on the left.
1636 */
1637 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1638 goto error0;
1639 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1640 if ((error = xfs_btree_delete(cnt_cur, &i)))
1641 goto error0;
1642 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1643 /*
1644 * Delete the old by-size entry on the right.
1645 */
1646 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1647 goto error0;
1648 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1649 if ((error = xfs_btree_delete(cnt_cur, &i)))
1650 goto error0;
1651 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1652 /*
1653 * Delete the old by-block entry for the right block.
1654 */
1655 if ((error = xfs_btree_delete(bno_cur, &i)))
1656 goto error0;
1657 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1658 /*
1659 * Move the by-block cursor back to the left neighbor.
1660 */
1661 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1662 goto error0;
1663 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1664#ifdef DEBUG
1665 /*
1666 * Check that this is the right record: delete didn't
1667 * mangle the cursor.
1668 */
1669 {
1670 xfs_agblock_t xxbno;
1671 xfs_extlen_t xxlen;
1672
1673 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
1674 &i)))
1675 goto error0;
1676 XFS_WANT_CORRUPTED_GOTO(
1677 i == 1 && xxbno == ltbno && xxlen == ltlen,
1678 error0);
1679 }
1680#endif
1681 /*
1682 * Update remaining by-block entry to the new, joined block.
1683 */
1684 nbno = ltbno;
1685 nlen = len + ltlen + gtlen;
1686 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1687 goto error0;
1688 }
1689 /*
1690 * Have only a left contiguous neighbor.
1691 * Merge it together with the new freespace.
1692 */
1693 else if (haveleft) {
1694 /*
1695 * Delete the old by-size entry on the left.
1696 */
1697 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1698 goto error0;
1699 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1700 if ((error = xfs_btree_delete(cnt_cur, &i)))
1701 goto error0;
1702 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1703 /*
1704 * Back up the by-block cursor to the left neighbor, and
1705 * update its length.
1706 */
1707 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1708 goto error0;
1709 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1710 nbno = ltbno;
1711 nlen = len + ltlen;
1712 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1713 goto error0;
1714 }
1715 /*
1716 * Have only a right contiguous neighbor.
1717 * Merge it together with the new freespace.
1718 */
1719 else if (haveright) {
1720 /*
1721 * Delete the old by-size entry on the right.
1722 */
1723 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1724 goto error0;
1725 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1726 if ((error = xfs_btree_delete(cnt_cur, &i)))
1727 goto error0;
1728 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1729 /*
1730 * Update the starting block and length of the right
1731 * neighbor in the by-block tree.
1732 */
1733 nbno = bno;
1734 nlen = len + gtlen;
1735 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1736 goto error0;
1737 }
1738 /*
1739 * No contiguous neighbors.
1740 * Insert the new freespace into the by-block tree.
1741 */
1742 else {
1743 nbno = bno;
1744 nlen = len;
1745 if ((error = xfs_btree_insert(bno_cur, &i)))
1746 goto error0;
1747 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1748 }
1749 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1750 bno_cur = NULL;
1751 /*
1752 * In all cases we need to insert the new freespace in the by-size tree.
1753 */
1754 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1755 goto error0;
1756 XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
1757 if ((error = xfs_btree_insert(cnt_cur, &i)))
1758 goto error0;
1759 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1760 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1761 cnt_cur = NULL;
1762
1763 /*
1764 * Update the freespace totals in the ag and superblock.
1765 */
1766 pag = xfs_perag_get(mp, agno);
1767 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1768 xfs_perag_put(pag);
1769 if (error)
1770 goto error0;
1771
1772 if (!isfl)
1773 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1774 XFS_STATS_INC(xs_freex);
1775 XFS_STATS_ADD(xs_freeb, len);
1776
1777 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1778
1779 return 0;
1780
1781 error0:
1782 trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
1783 if (bno_cur)
1784 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1785 if (cnt_cur)
1786 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1787 return error;
1788}
1789
1790/*
1791 * Visible (exported) allocation/free functions.
1792 * Some of these are used just by xfs_alloc_btree.c and this file.
1793 */
1794
1795/*
1796 * Compute and fill in value of m_ag_maxlevels.
1797 */
1798void
1799xfs_alloc_compute_maxlevels(
1800 xfs_mount_t *mp) /* file system mount structure */
1801{
1802 int level;
1803 uint maxblocks;
1804 uint maxleafents;
1805 int minleafrecs;
1806 int minnoderecs;
1807
1808 maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
1809 minleafrecs = mp->m_alloc_mnr[0];
1810 minnoderecs = mp->m_alloc_mnr[1];
1811 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1812 for (level = 1; maxblocks > 1; level++)
1813 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1814 mp->m_ag_maxlevels = level;
1815}
1816
1817/*
1818 * Find the length of the longest extent in an AG.
1819 */
1820xfs_extlen_t
1821xfs_alloc_longest_free_extent(
1822 struct xfs_mount *mp,
1823 struct xfs_perag *pag)
1824{
1825 xfs_extlen_t need, delta = 0;
1826
1827 need = XFS_MIN_FREELIST_PAG(pag, mp);
1828 if (need > pag->pagf_flcount)
1829 delta = need - pag->pagf_flcount;
1830
1831 if (pag->pagf_longest > delta)
1832 return pag->pagf_longest - delta;
1833 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
1834}
1835
1836/*
1837 * Decide whether to use this allocation group for this allocation.
1838 * If so, fix up the btree freelist's size.
1839 */
1840STATIC int /* error */
1841xfs_alloc_fix_freelist(
1842 xfs_alloc_arg_t *args, /* allocation argument structure */
1843 int flags) /* XFS_ALLOC_FLAG_... */
1844{
1845 xfs_buf_t *agbp; /* agf buffer pointer */
1846 xfs_agf_t *agf; /* a.g. freespace structure pointer */
1847 xfs_buf_t *agflbp;/* agfl buffer pointer */
1848 xfs_agblock_t bno; /* freelist block */
1849 xfs_extlen_t delta; /* new blocks needed in freelist */
1850 int error; /* error result code */
1851 xfs_extlen_t longest;/* longest extent in allocation group */
1852 xfs_mount_t *mp; /* file system mount point structure */
1853 xfs_extlen_t need; /* total blocks needed in freelist */
1854 xfs_perag_t *pag; /* per-ag information structure */
1855 xfs_alloc_arg_t targs; /* local allocation arguments */
1856 xfs_trans_t *tp; /* transaction pointer */
1857
1858 mp = args->mp;
1859
1860 pag = args->pag;
1861 tp = args->tp;
1862 if (!pag->pagf_init) {
1863 if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
1864 &agbp)))
1865 return error;
1866 if (!pag->pagf_init) {
1867 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
1868 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
1869 args->agbp = NULL;
1870 return 0;
1871 }
1872 } else
1873 agbp = NULL;
1874
1875 /*
1876 * If this is a metadata preferred pag and we are user data
1877 * then try somewhere else if we are not being asked to
1878 * try harder at this point
1879 */
1880 if (pag->pagf_metadata && args->userdata &&
1881 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
1882 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
1883 args->agbp = NULL;
1884 return 0;
1885 }
1886
1887 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1888 /*
1889 * If it looks like there isn't a long enough extent, or enough
1890 * total blocks, reject it.
1891 */
1892 need = XFS_MIN_FREELIST_PAG(pag, mp);
1893 longest = xfs_alloc_longest_free_extent(mp, pag);
1894 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1895 longest ||
1896 ((int)(pag->pagf_freeblks + pag->pagf_flcount -
1897 need - args->total) < (int)args->minleft)) {
1898 if (agbp)
1899 xfs_trans_brelse(tp, agbp);
1900 args->agbp = NULL;
1901 return 0;
1902 }
1903 }
1904
1905 /*
1906 * Get the a.g. freespace buffer.
1907 * Can fail if we're not blocking on locks, and it's held.
1908 */
1909 if (agbp == NULL) {
1910 if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
1911 &agbp)))
1912 return error;
1913 if (agbp == NULL) {
1914 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
1915 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
1916 args->agbp = NULL;
1917 return 0;
1918 }
1919 }
1920 /*
1921 * Figure out how many blocks we should have in the freelist.
1922 */
1923 agf = XFS_BUF_TO_AGF(agbp);
1924 need = XFS_MIN_FREELIST(agf, mp);
1925 /*
1926 * If there isn't enough total or single-extent, reject it.
1927 */
1928 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1929 delta = need > be32_to_cpu(agf->agf_flcount) ?
1930 (need - be32_to_cpu(agf->agf_flcount)) : 0;
1931 longest = be32_to_cpu(agf->agf_longest);
1932 longest = (longest > delta) ? (longest - delta) :
1933 (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
1934 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1935 longest ||
1936 ((int)(be32_to_cpu(agf->agf_freeblks) +
1937 be32_to_cpu(agf->agf_flcount) - need - args->total) <
1938 (int)args->minleft)) {
1939 xfs_trans_brelse(tp, agbp);
1940 args->agbp = NULL;
1941 return 0;
1942 }
1943 }
1944 /*
1945 * Make the freelist shorter if it's too long.
1946 */
1947 while (be32_to_cpu(agf->agf_flcount) > need) {
1948 xfs_buf_t *bp;
1949
1950 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
1951 if (error)
1952 return error;
1953 if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
1954 return error;
1955 bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
1956 xfs_trans_binval(tp, bp);
1957 }
1958 /*
1959 * Initialize the args structure.
1960 */
1961 memset(&targs, 0, sizeof(targs));
1962 targs.tp = tp;
1963 targs.mp = mp;
1964 targs.agbp = agbp;
1965 targs.agno = args->agno;
1966 targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
1967 targs.type = XFS_ALLOCTYPE_THIS_AG;
1968 targs.pag = pag;
1969 if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
1970 return error;
1971 /*
1972 * Make the freelist longer if it's too short.
1973 */
1974 while (be32_to_cpu(agf->agf_flcount) < need) {
1975 targs.agbno = 0;
1976 targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
1977 /*
1978 * Allocate as many blocks as possible at once.
1979 */
1980 if ((error = xfs_alloc_ag_vextent(&targs))) {
1981 xfs_trans_brelse(tp, agflbp);
1982 return error;
1983 }
1984 /*
1985 * Stop if we run out. Won't happen if callers are obeying
1986 * the restrictions correctly. Can happen for free calls
1987 * on a completely full ag.
1988 */
1989 if (targs.agbno == NULLAGBLOCK) {
1990 if (flags & XFS_ALLOC_FLAG_FREEING)
1991 break;
1992 xfs_trans_brelse(tp, agflbp);
1993 args->agbp = NULL;
1994 return 0;
1995 }
1996 /*
1997 * Put each allocated block on the list.
1998 */
1999 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2000 error = xfs_alloc_put_freelist(tp, agbp,
2001 agflbp, bno, 0);
2002 if (error)
2003 return error;
2004 }
2005 }
2006 xfs_trans_brelse(tp, agflbp);
2007 args->agbp = agbp;
2008 return 0;
2009}
2010
2011/*
2012 * Get a block from the freelist.
2013 * Returns with the buffer for the block gotten.
2014 */
2015int /* error */
2016xfs_alloc_get_freelist(
2017 xfs_trans_t *tp, /* transaction pointer */
2018 xfs_buf_t *agbp, /* buffer containing the agf structure */
2019 xfs_agblock_t *bnop, /* block address retrieved from freelist */
2020 int btreeblk) /* destination is a AGF btree */
2021{
2022 xfs_agf_t *agf; /* a.g. freespace structure */
2023 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
2024 xfs_agblock_t bno; /* block number returned */
2025 __be32 *agfl_bno;
2026 int error;
2027 int logflags;
2028 xfs_mount_t *mp = tp->t_mountp;
2029 xfs_perag_t *pag; /* per allocation group data */
2030
2031 /*
2032 * Freelist is empty, give up.
2033 */
2034 agf = XFS_BUF_TO_AGF(agbp);
2035 if (!agf->agf_flcount) {
2036 *bnop = NULLAGBLOCK;
2037 return 0;
2038 }
2039 /*
2040 * Read the array of free blocks.
2041 */
2042 error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
2043 &agflbp);
2044 if (error)
2045 return error;
2046
2047
2048 /*
2049 * Get the block number and update the data structures.
2050 */
2051 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
2052 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2053 be32_add_cpu(&agf->agf_flfirst, 1);
2054 xfs_trans_brelse(tp, agflbp);
2055 if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
2056 agf->agf_flfirst = 0;
2057
2058 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2059 be32_add_cpu(&agf->agf_flcount, -1);
2060 xfs_trans_agflist_delta(tp, -1);
2061 pag->pagf_flcount--;
2062 xfs_perag_put(pag);
2063
2064 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2065 if (btreeblk) {
2066 be32_add_cpu(&agf->agf_btreeblks, 1);
2067 pag->pagf_btreeblks++;
2068 logflags |= XFS_AGF_BTREEBLKS;
2069 }
2070
2071 xfs_alloc_log_agf(tp, agbp, logflags);
2072 *bnop = bno;
2073
2074 return 0;
2075}
2076
2077/*
2078 * Log the given fields from the agf structure.
2079 */
2080void
2081xfs_alloc_log_agf(
2082 xfs_trans_t *tp, /* transaction pointer */
2083 xfs_buf_t *bp, /* buffer for a.g. freelist header */
2084 int fields) /* mask of fields to be logged (XFS_AGF_...) */
2085{
2086 int first; /* first byte offset */
2087 int last; /* last byte offset */
2088 static const short offsets[] = {
2089 offsetof(xfs_agf_t, agf_magicnum),
2090 offsetof(xfs_agf_t, agf_versionnum),
2091 offsetof(xfs_agf_t, agf_seqno),
2092 offsetof(xfs_agf_t, agf_length),
2093 offsetof(xfs_agf_t, agf_roots[0]),
2094 offsetof(xfs_agf_t, agf_levels[0]),
2095 offsetof(xfs_agf_t, agf_flfirst),
2096 offsetof(xfs_agf_t, agf_fllast),
2097 offsetof(xfs_agf_t, agf_flcount),
2098 offsetof(xfs_agf_t, agf_freeblks),
2099 offsetof(xfs_agf_t, agf_longest),
2100 offsetof(xfs_agf_t, agf_btreeblks),
2101 offsetof(xfs_agf_t, agf_uuid),
2102 sizeof(xfs_agf_t)
2103 };
2104
2105 trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
2106
2107 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2108
2109 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2110 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2111}
2112
2113/*
2114 * Interface for inode allocation to force the pag data to be initialized.
2115 */
2116int /* error */
2117xfs_alloc_pagf_init(
2118 xfs_mount_t *mp, /* file system mount structure */
2119 xfs_trans_t *tp, /* transaction pointer */
2120 xfs_agnumber_t agno, /* allocation group number */
2121 int flags) /* XFS_ALLOC_FLAGS_... */
2122{
2123 xfs_buf_t *bp;
2124 int error;
2125
2126 if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
2127 return error;
2128 if (bp)
2129 xfs_trans_brelse(tp, bp);
2130 return 0;
2131}
2132
2133/*
2134 * Put the block on the freelist for the allocation group.
2135 */
2136int /* error */
2137xfs_alloc_put_freelist(
2138 xfs_trans_t *tp, /* transaction pointer */
2139 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
2140 xfs_buf_t *agflbp,/* buffer for a.g. free block array */
2141 xfs_agblock_t bno, /* block being freed */
2142 int btreeblk) /* block came from a AGF btree */
2143{
2144 xfs_agf_t *agf; /* a.g. freespace structure */
2145 __be32 *blockp;/* pointer to array entry */
2146 int error;
2147 int logflags;
2148 xfs_mount_t *mp; /* mount structure */
2149 xfs_perag_t *pag; /* per allocation group data */
2150 __be32 *agfl_bno;
2151 int startoff;
2152
2153 agf = XFS_BUF_TO_AGF(agbp);
2154 mp = tp->t_mountp;
2155
2156 if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2157 be32_to_cpu(agf->agf_seqno), &agflbp)))
2158 return error;
2159 be32_add_cpu(&agf->agf_fllast, 1);
2160 if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
2161 agf->agf_fllast = 0;
2162
2163 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2164 be32_add_cpu(&agf->agf_flcount, 1);
2165 xfs_trans_agflist_delta(tp, 1);
2166 pag->pagf_flcount++;
2167
2168 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2169 if (btreeblk) {
2170 be32_add_cpu(&agf->agf_btreeblks, -1);
2171 pag->pagf_btreeblks--;
2172 logflags |= XFS_AGF_BTREEBLKS;
2173 }
2174 xfs_perag_put(pag);
2175
2176 xfs_alloc_log_agf(tp, agbp, logflags);
2177
2178 ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
2179
2180 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
2181 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2182 *blockp = cpu_to_be32(bno);
2183 startoff = (char *)blockp - (char *)agflbp->b_addr;
2184
2185 xfs_alloc_log_agf(tp, agbp, logflags);
2186
2187 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2188 xfs_trans_log_buf(tp, agflbp, startoff,
2189 startoff + sizeof(xfs_agblock_t) - 1);
2190 return 0;
2191}
2192
2193static bool
2194xfs_agf_verify(
2195 struct xfs_mount *mp,
2196 struct xfs_buf *bp)
2197 {
2198 struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
2199
2200 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2201 !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
2202 return false;
2203
2204 if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2205 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2206 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2207 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2208 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2209 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
2210 return false;
2211
2212 /*
2213 * during growfs operations, the perag is not fully initialised,
2214 * so we can't use it for any useful checking. growfs ensures we can't
2215 * use it by using uncached buffers that don't have the perag attached
2216 * so we can detect and avoid this problem.
2217 */
2218 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2219 return false;
2220
2221 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
2222 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2223 return false;
2224
2225 return true;;
2226
2227}
2228
2229static void
2230xfs_agf_read_verify(
2231 struct xfs_buf *bp)
2232{
2233 struct xfs_mount *mp = bp->b_target->bt_mount;
2234
2235 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2236 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2237 xfs_buf_ioerror(bp, -EFSBADCRC);
2238 else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
2239 XFS_ERRTAG_ALLOC_READ_AGF,
2240 XFS_RANDOM_ALLOC_READ_AGF))
2241 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2242
2243 if (bp->b_error)
2244 xfs_verifier_error(bp);
2245}
2246
2247static void
2248xfs_agf_write_verify(
2249 struct xfs_buf *bp)
2250{
2251 struct xfs_mount *mp = bp->b_target->bt_mount;
2252 struct xfs_buf_log_item *bip = bp->b_fspriv;
2253
2254 if (!xfs_agf_verify(mp, bp)) {
2255 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2256 xfs_verifier_error(bp);
2257 return;
2258 }
2259
2260 if (!xfs_sb_version_hascrc(&mp->m_sb))
2261 return;
2262
2263 if (bip)
2264 XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2265
2266 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2267}
2268
2269const struct xfs_buf_ops xfs_agf_buf_ops = {
2270 .verify_read = xfs_agf_read_verify,
2271 .verify_write = xfs_agf_write_verify,
2272};
2273
2274/*
2275 * Read in the allocation group header (free/alloc section).
2276 */
2277int /* error */
2278xfs_read_agf(
2279 struct xfs_mount *mp, /* mount point structure */
2280 struct xfs_trans *tp, /* transaction pointer */
2281 xfs_agnumber_t agno, /* allocation group number */
2282 int flags, /* XFS_BUF_ */
2283 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2284{
2285 int error;
2286
2287 trace_xfs_read_agf(mp, agno);
2288
2289 ASSERT(agno != NULLAGNUMBER);
2290 error = xfs_trans_read_buf(
2291 mp, tp, mp->m_ddev_targp,
2292 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2293 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2294 if (error)
2295 return error;
2296 if (!*bpp)
2297 return 0;
2298
2299 ASSERT(!(*bpp)->b_error);
2300 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2301 return 0;
2302}
2303
2304/*
2305 * Read in the allocation group header (free/alloc section).
2306 */
2307int /* error */
2308xfs_alloc_read_agf(
2309 struct xfs_mount *mp, /* mount point structure */
2310 struct xfs_trans *tp, /* transaction pointer */
2311 xfs_agnumber_t agno, /* allocation group number */
2312 int flags, /* XFS_ALLOC_FLAG_... */
2313 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2314{
2315 struct xfs_agf *agf; /* ag freelist header */
2316 struct xfs_perag *pag; /* per allocation group data */
2317 int error;
2318
2319 trace_xfs_alloc_read_agf(mp, agno);
2320
2321 ASSERT(agno != NULLAGNUMBER);
2322 error = xfs_read_agf(mp, tp, agno,
2323 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2324 bpp);
2325 if (error)
2326 return error;
2327 if (!*bpp)
2328 return 0;
2329 ASSERT(!(*bpp)->b_error);
2330
2331 agf = XFS_BUF_TO_AGF(*bpp);
2332 pag = xfs_perag_get(mp, agno);
2333 if (!pag->pagf_init) {
2334 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
2335 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
2336 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
2337 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
2338 pag->pagf_levels[XFS_BTNUM_BNOi] =
2339 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
2340 pag->pagf_levels[XFS_BTNUM_CNTi] =
2341 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2342 spin_lock_init(&pag->pagb_lock);
2343 pag->pagb_count = 0;
2344 pag->pagb_tree = RB_ROOT;
2345 pag->pagf_init = 1;
2346 }
2347#ifdef DEBUG
2348 else if (!XFS_FORCED_SHUTDOWN(mp)) {
2349 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
2350 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
2351 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
2352 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2353 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
2354 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
2355 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
2356 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2357 }
2358#endif
2359 xfs_perag_put(pag);
2360 return 0;
2361}
2362
2363/*
2364 * Allocate an extent (variable-size).
2365 * Depending on the allocation type, we either look in a single allocation
2366 * group or loop over the allocation groups to find the result.
2367 */
2368int /* error */
2369xfs_alloc_vextent(
2370 xfs_alloc_arg_t *args) /* allocation argument structure */
2371{
2372 xfs_agblock_t agsize; /* allocation group size */
2373 int error;
2374 int flags; /* XFS_ALLOC_FLAG_... locking flags */
2375 xfs_extlen_t minleft;/* minimum left value, temp copy */
2376 xfs_mount_t *mp; /* mount structure pointer */
2377 xfs_agnumber_t sagno; /* starting allocation group number */
2378 xfs_alloctype_t type; /* input allocation type */
2379 int bump_rotor = 0;
2380 int no_min = 0;
2381 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
2382
2383 mp = args->mp;
2384 type = args->otype = args->type;
2385 args->agbno = NULLAGBLOCK;
2386 /*
2387 * Just fix this up, for the case where the last a.g. is shorter
2388 * (or there's only one a.g.) and the caller couldn't easily figure
2389 * that out (xfs_bmap_alloc).
2390 */
2391 agsize = mp->m_sb.sb_agblocks;
2392 if (args->maxlen > agsize)
2393 args->maxlen = agsize;
2394 if (args->alignment == 0)
2395 args->alignment = 1;
2396 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
2397 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
2398 ASSERT(args->minlen <= args->maxlen);
2399 ASSERT(args->minlen <= agsize);
2400 ASSERT(args->mod < args->prod);
2401 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
2402 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
2403 args->minlen > args->maxlen || args->minlen > agsize ||
2404 args->mod >= args->prod) {
2405 args->fsbno = NULLFSBLOCK;
2406 trace_xfs_alloc_vextent_badargs(args);
2407 return 0;
2408 }
2409 minleft = args->minleft;
2410
2411 switch (type) {
2412 case XFS_ALLOCTYPE_THIS_AG:
2413 case XFS_ALLOCTYPE_NEAR_BNO:
2414 case XFS_ALLOCTYPE_THIS_BNO:
2415 /*
2416 * These three force us into a single a.g.
2417 */
2418 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2419 args->pag = xfs_perag_get(mp, args->agno);
2420 args->minleft = 0;
2421 error = xfs_alloc_fix_freelist(args, 0);
2422 args->minleft = minleft;
2423 if (error) {
2424 trace_xfs_alloc_vextent_nofix(args);
2425 goto error0;
2426 }
2427 if (!args->agbp) {
2428 trace_xfs_alloc_vextent_noagbp(args);
2429 break;
2430 }
2431 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2432 if ((error = xfs_alloc_ag_vextent(args)))
2433 goto error0;
2434 break;
2435 case XFS_ALLOCTYPE_START_BNO:
2436 /*
2437 * Try near allocation first, then anywhere-in-ag after
2438 * the first a.g. fails.
2439 */
2440 if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
2441 (mp->m_flags & XFS_MOUNT_32BITINODES)) {
2442 args->fsbno = XFS_AGB_TO_FSB(mp,
2443 ((mp->m_agfrotor / rotorstep) %
2444 mp->m_sb.sb_agcount), 0);
2445 bump_rotor = 1;
2446 }
2447 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2448 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2449 /* FALLTHROUGH */
2450 case XFS_ALLOCTYPE_ANY_AG:
2451 case XFS_ALLOCTYPE_START_AG:
2452 case XFS_ALLOCTYPE_FIRST_AG:
2453 /*
2454 * Rotate through the allocation groups looking for a winner.
2455 */
2456 if (type == XFS_ALLOCTYPE_ANY_AG) {
2457 /*
2458 * Start with the last place we left off.
2459 */
2460 args->agno = sagno = (mp->m_agfrotor / rotorstep) %
2461 mp->m_sb.sb_agcount;
2462 args->type = XFS_ALLOCTYPE_THIS_AG;
2463 flags = XFS_ALLOC_FLAG_TRYLOCK;
2464 } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
2465 /*
2466 * Start with allocation group given by bno.
2467 */
2468 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2469 args->type = XFS_ALLOCTYPE_THIS_AG;
2470 sagno = 0;
2471 flags = 0;
2472 } else {
2473 if (type == XFS_ALLOCTYPE_START_AG)
2474 args->type = XFS_ALLOCTYPE_THIS_AG;
2475 /*
2476 * Start with the given allocation group.
2477 */
2478 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2479 flags = XFS_ALLOC_FLAG_TRYLOCK;
2480 }
2481 /*
2482 * Loop over allocation groups twice; first time with
2483 * trylock set, second time without.
2484 */
2485 for (;;) {
2486 args->pag = xfs_perag_get(mp, args->agno);
2487 if (no_min) args->minleft = 0;
2488 error = xfs_alloc_fix_freelist(args, flags);
2489 args->minleft = minleft;
2490 if (error) {
2491 trace_xfs_alloc_vextent_nofix(args);
2492 goto error0;
2493 }
2494 /*
2495 * If we get a buffer back then the allocation will fly.
2496 */
2497 if (args->agbp) {
2498 if ((error = xfs_alloc_ag_vextent(args)))
2499 goto error0;
2500 break;
2501 }
2502
2503 trace_xfs_alloc_vextent_loopfailed(args);
2504
2505 /*
2506 * Didn't work, figure out the next iteration.
2507 */
2508 if (args->agno == sagno &&
2509 type == XFS_ALLOCTYPE_START_BNO)
2510 args->type = XFS_ALLOCTYPE_THIS_AG;
2511 /*
2512 * For the first allocation, we can try any AG to get
2513 * space. However, if we already have allocated a
2514 * block, we don't want to try AGs whose number is below
2515 * sagno. Otherwise, we may end up with out-of-order
2516 * locking of AGF, which might cause deadlock.
2517 */
2518 if (++(args->agno) == mp->m_sb.sb_agcount) {
2519 if (args->firstblock != NULLFSBLOCK)
2520 args->agno = sagno;
2521 else
2522 args->agno = 0;
2523 }
2524 /*
2525 * Reached the starting a.g., must either be done
2526 * or switch to non-trylock mode.
2527 */
2528 if (args->agno == sagno) {
2529 if (no_min == 1) {
2530 args->agbno = NULLAGBLOCK;
2531 trace_xfs_alloc_vextent_allfailed(args);
2532 break;
2533 }
2534 if (flags == 0) {
2535 no_min = 1;
2536 } else {
2537 flags = 0;
2538 if (type == XFS_ALLOCTYPE_START_BNO) {
2539 args->agbno = XFS_FSB_TO_AGBNO(mp,
2540 args->fsbno);
2541 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2542 }
2543 }
2544 }
2545 xfs_perag_put(args->pag);
2546 }
2547 if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
2548 if (args->agno == sagno)
2549 mp->m_agfrotor = (mp->m_agfrotor + 1) %
2550 (mp->m_sb.sb_agcount * rotorstep);
2551 else
2552 mp->m_agfrotor = (args->agno * rotorstep + 1) %
2553 (mp->m_sb.sb_agcount * rotorstep);
2554 }
2555 break;
2556 default:
2557 ASSERT(0);
2558 /* NOTREACHED */
2559 }
2560 if (args->agbno == NULLAGBLOCK)
2561 args->fsbno = NULLFSBLOCK;
2562 else {
2563 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
2564#ifdef DEBUG
2565 ASSERT(args->len >= args->minlen);
2566 ASSERT(args->len <= args->maxlen);
2567 ASSERT(args->agbno % args->alignment == 0);
2568 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
2569 args->len);
2570#endif
2571 }
2572 xfs_perag_put(args->pag);
2573 return 0;
2574error0:
2575 xfs_perag_put(args->pag);
2576 return error;
2577}
2578
2579/*
2580 * Free an extent.
2581 * Just break up the extent address and hand off to xfs_free_ag_extent
2582 * after fixing up the freelist.
2583 */
2584int /* error */
2585xfs_free_extent(
2586 xfs_trans_t *tp, /* transaction pointer */
2587 xfs_fsblock_t bno, /* starting block number of extent */
2588 xfs_extlen_t len) /* length of extent */
2589{
2590 xfs_alloc_arg_t args;
2591 int error;
2592
2593 ASSERT(len != 0);
2594 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2595 args.tp = tp;
2596 args.mp = tp->t_mountp;
2597
2598 /*
2599 * validate that the block number is legal - the enables us to detect
2600 * and handle a silent filesystem corruption rather than crashing.
2601 */
2602 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2603 if (args.agno >= args.mp->m_sb.sb_agcount)
2604 return -EFSCORRUPTED;
2605
2606 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2607 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2608 return -EFSCORRUPTED;
2609
2610 args.pag = xfs_perag_get(args.mp, args.agno);
2611 ASSERT(args.pag);
2612
2613 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2614 if (error)
2615 goto error0;
2616
2617 /* validate the extent size is legal now we have the agf locked */
2618 if (args.agbno + len >
2619 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2620 error = -EFSCORRUPTED;
2621 goto error0;
2622 }
2623
2624 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2625 if (!error)
2626 xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
2627error0:
2628 xfs_perag_put(args.pag);
2629 return error;
2630}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
new file mode 100644
index 000000000000..feacb061bab7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -0,0 +1,234 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_ALLOC_H__
19#define __XFS_ALLOC_H__
20
21struct xfs_buf;
22struct xfs_btree_cur;
23struct xfs_mount;
24struct xfs_perag;
25struct xfs_trans;
26
27extern struct workqueue_struct *xfs_alloc_wq;
28
29/*
30 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
31 */
32#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */
33#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
34#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */
35#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
36#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
37#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
38#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
39
40/* this should become an enum again when the tracing code is fixed */
41typedef unsigned int xfs_alloctype_t;
42
43#define XFS_ALLOC_TYPES \
44 { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \
45 { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \
46 { XFS_ALLOCTYPE_START_AG, "START_AG" }, \
47 { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \
48 { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \
49 { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \
50 { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" }
51
52/*
53 * Flags for xfs_alloc_fix_freelist.
54 */
55#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
56#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
57
58/*
59 * In order to avoid ENOSPC-related deadlock caused by
60 * out-of-order locking of AGF buffer (PV 947395), we place
61 * constraints on the relationship among actual allocations for
62 * data blocks, freelist blocks, and potential file data bmap
63 * btree blocks. However, these restrictions may result in no
64 * actual space allocated for a delayed extent, for example, a data
65 * block in a certain AG is allocated but there is no additional
66 * block for the additional bmap btree block due to a split of the
67 * bmap btree of the file. The result of this may lead to an
68 * infinite loop in xfssyncd when the file gets flushed to disk and
69 * all delayed extents need to be actually allocated. To get around
70 * this, we explicitly set aside a few blocks which will not be
71 * reserved in delayed allocation. Considering the minimum number of
72 * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
73 * btree requires 1 fsb, so we set the number of set-aside blocks
74 * to 4 + 4*agcount.
75 */
76#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
77
78/*
79 * When deciding how much space to allocate out of an AG, we limit the
80 * allocation maximum size to the size of the AG. However, we cannot use all the
81 * blocks in the AG - some are permanently used by metadata. These
82 * blocks are generally:
83 * - the AG superblock, AGF, AGI and AGFL
84 * - the AGF (bno and cnt) and AGI btree root blocks
85 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
86 *
87 * The AG headers are sector sized, so the amount of space they take up is
88 * dependent on filesystem geometry. The others are all single blocks.
89 */
90#define XFS_ALLOC_AG_MAX_USABLE(mp) \
91 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
92
93
94/*
95 * Argument structure for xfs_alloc routines.
96 * This is turned into a structure to avoid having 20 arguments passed
97 * down several levels of the stack.
98 */
99typedef struct xfs_alloc_arg {
100 struct xfs_trans *tp; /* transaction pointer */
101 struct xfs_mount *mp; /* file system mount point */
102 struct xfs_buf *agbp; /* buffer for a.g. freelist header */
103 struct xfs_perag *pag; /* per-ag struct for this agno */
104 xfs_fsblock_t fsbno; /* file system block number */
105 xfs_agnumber_t agno; /* allocation group number */
106 xfs_agblock_t agbno; /* allocation group-relative block # */
107 xfs_extlen_t minlen; /* minimum size of extent */
108 xfs_extlen_t maxlen; /* maximum size of extent */
109 xfs_extlen_t mod; /* mod value for extent size */
110 xfs_extlen_t prod; /* prod value for extent size */
111 xfs_extlen_t minleft; /* min blocks must be left after us */
112 xfs_extlen_t total; /* total blocks needed in xaction */
113 xfs_extlen_t alignment; /* align answer to multiple of this */
114 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
115 xfs_extlen_t len; /* output: actual size of extent */
116 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
117 xfs_alloctype_t otype; /* original allocation type */
118 char wasdel; /* set if allocation was prev delayed */
119 char wasfromfl; /* set if allocation is from freelist */
120 char isfl; /* set if is freelist blocks - !acctg */
121 char userdata; /* set if this is user data */
122 xfs_fsblock_t firstblock; /* io first block allocated */
123} xfs_alloc_arg_t;
124
125/*
126 * Defines for userdata
127 */
128#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
129#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
130
131/*
132 * Find the length of the longest extent in an AG.
133 */
134xfs_extlen_t
135xfs_alloc_longest_free_extent(struct xfs_mount *mp,
136 struct xfs_perag *pag);
137
138/*
139 * Compute and fill in value of m_ag_maxlevels.
140 */
141void
142xfs_alloc_compute_maxlevels(
143 struct xfs_mount *mp); /* file system mount structure */
144
145/*
146 * Get a block from the freelist.
147 * The block number retrieved is handed back through bnop.
148 */
149int /* error */
150xfs_alloc_get_freelist(
151 struct xfs_trans *tp, /* transaction pointer */
152 struct xfs_buf *agbp, /* buffer containing the agf structure */
153 xfs_agblock_t *bnop, /* block address retrieved from freelist */
154 int btreeblk); /* destination is an AGF btree */
155
156/*
157 * Log the given fields from the agf structure.
158 */
159void
160xfs_alloc_log_agf(
161 struct xfs_trans *tp, /* transaction pointer */
162 struct xfs_buf *bp, /* buffer for a.g. freelist header */
163 int fields);/* mask of fields to be logged (XFS_AGF_...) */
164
165/*
166 * Interface for inode allocation to force the pag data to be initialized.
167 */
168int /* error */
169xfs_alloc_pagf_init(
170 struct xfs_mount *mp, /* file system mount structure */
171 struct xfs_trans *tp, /* transaction pointer */
172 xfs_agnumber_t agno, /* allocation group number */
173 int flags); /* XFS_ALLOC_FLAG_... */
174
175/*
176 * Put the block on the freelist for the allocation group.
177 */
178int /* error */
179xfs_alloc_put_freelist(
180 struct xfs_trans *tp, /* transaction pointer */
181 struct xfs_buf *agbp, /* buffer for a.g. freelist header */
182 struct xfs_buf *agflbp,/* buffer for a.g. free block array */
183 xfs_agblock_t bno, /* block being freed */
184 int btreeblk); /* owner was an AGF btree */
185
186/*
187 * Read in the allocation group header (free/alloc section).
188 */
189int /* error */
190xfs_alloc_read_agf(
191 struct xfs_mount *mp, /* mount point structure */
192 struct xfs_trans *tp, /* transaction pointer */
193 xfs_agnumber_t agno, /* allocation group number */
194 int flags, /* XFS_ALLOC_FLAG_... */
195 struct xfs_buf **bpp); /* buffer for the ag freelist header */
196
197/*
198 * Allocate an extent (variable-size).
199 */
200int /* error */
201xfs_alloc_vextent(
202 xfs_alloc_arg_t *args); /* allocation argument structure */
203
204/*
205 * Free an extent.
206 */
207int /* error */
208xfs_free_extent(
209 struct xfs_trans *tp, /* transaction pointer */
210 xfs_fsblock_t bno, /* starting block number of extent */
211 xfs_extlen_t len); /* length of extent */
212
213int /* error */
214xfs_alloc_lookup_le(
215 struct xfs_btree_cur *cur, /* btree cursor */
216 xfs_agblock_t bno, /* starting block of extent */
217 xfs_extlen_t len, /* length of extent */
218 int *stat); /* success/failure */
219
220int /* error */
221xfs_alloc_lookup_ge(
222 struct xfs_btree_cur *cur, /* btree cursor */
223 xfs_agblock_t bno, /* starting block of extent */
224 xfs_extlen_t len, /* length of extent */
225 int *stat); /* success/failure */
226
227int /* error */
228xfs_alloc_get_rec(
229 struct xfs_btree_cur *cur, /* btree cursor */
230 xfs_agblock_t *bno, /* output: starting block of extent */
231 xfs_extlen_t *len, /* output: length of extent */
232 int *stat); /* output: success/failure */
233
234#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
new file mode 100644
index 000000000000..e0e83e24d3ef
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -0,0 +1,504 @@
1/*
2 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27#include "xfs_btree.h"
28#include "xfs_alloc_btree.h"
29#include "xfs_alloc.h"
30#include "xfs_extent_busy.h"
31#include "xfs_error.h"
32#include "xfs_trace.h"
33#include "xfs_cksum.h"
34#include "xfs_trans.h"
35
36
37STATIC struct xfs_btree_cur *
38xfs_allocbt_dup_cursor(
39 struct xfs_btree_cur *cur)
40{
41 return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
42 cur->bc_private.a.agbp, cur->bc_private.a.agno,
43 cur->bc_btnum);
44}
45
46STATIC void
47xfs_allocbt_set_root(
48 struct xfs_btree_cur *cur,
49 union xfs_btree_ptr *ptr,
50 int inc)
51{
52 struct xfs_buf *agbp = cur->bc_private.a.agbp;
53 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
54 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
55 int btnum = cur->bc_btnum;
56 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
57
58 ASSERT(ptr->s != 0);
59
60 agf->agf_roots[btnum] = ptr->s;
61 be32_add_cpu(&agf->agf_levels[btnum], inc);
62 pag->pagf_levels[btnum] += inc;
63 xfs_perag_put(pag);
64
65 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
66}
67
68STATIC int
69xfs_allocbt_alloc_block(
70 struct xfs_btree_cur *cur,
71 union xfs_btree_ptr *start,
72 union xfs_btree_ptr *new,
73 int *stat)
74{
75 int error;
76 xfs_agblock_t bno;
77
78 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
79
80 /* Allocate the new block from the freelist. If we can't, give up. */
81 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
82 &bno, 1);
83 if (error) {
84 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
85 return error;
86 }
87
88 if (bno == NULLAGBLOCK) {
89 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
90 *stat = 0;
91 return 0;
92 }
93
94 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
95
96 xfs_trans_agbtree_delta(cur->bc_tp, 1);
97 new->s = cpu_to_be32(bno);
98
99 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
100 *stat = 1;
101 return 0;
102}
103
104STATIC int
105xfs_allocbt_free_block(
106 struct xfs_btree_cur *cur,
107 struct xfs_buf *bp)
108{
109 struct xfs_buf *agbp = cur->bc_private.a.agbp;
110 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
111 xfs_agblock_t bno;
112 int error;
113
114 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
115 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
116 if (error)
117 return error;
118
119 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
120 XFS_EXTENT_BUSY_SKIP_DISCARD);
121 xfs_trans_agbtree_delta(cur->bc_tp, -1);
122
123 xfs_trans_binval(cur->bc_tp, bp);
124 return 0;
125}
126
127/*
128 * Update the longest extent in the AGF
129 */
130STATIC void
131xfs_allocbt_update_lastrec(
132 struct xfs_btree_cur *cur,
133 struct xfs_btree_block *block,
134 union xfs_btree_rec *rec,
135 int ptr,
136 int reason)
137{
138 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
139 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
140 struct xfs_perag *pag;
141 __be32 len;
142 int numrecs;
143
144 ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
145
146 switch (reason) {
147 case LASTREC_UPDATE:
148 /*
149 * If this is the last leaf block and it's the last record,
150 * then update the size of the longest extent in the AG.
151 */
152 if (ptr != xfs_btree_get_numrecs(block))
153 return;
154 len = rec->alloc.ar_blockcount;
155 break;
156 case LASTREC_INSREC:
157 if (be32_to_cpu(rec->alloc.ar_blockcount) <=
158 be32_to_cpu(agf->agf_longest))
159 return;
160 len = rec->alloc.ar_blockcount;
161 break;
162 case LASTREC_DELREC:
163 numrecs = xfs_btree_get_numrecs(block);
164 if (ptr <= numrecs)
165 return;
166 ASSERT(ptr == numrecs + 1);
167
168 if (numrecs) {
169 xfs_alloc_rec_t *rrp;
170
171 rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
172 len = rrp->ar_blockcount;
173 } else {
174 len = 0;
175 }
176
177 break;
178 default:
179 ASSERT(0);
180 return;
181 }
182
183 agf->agf_longest = len;
184 pag = xfs_perag_get(cur->bc_mp, seqno);
185 pag->pagf_longest = be32_to_cpu(len);
186 xfs_perag_put(pag);
187 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
188}
189
190STATIC int
191xfs_allocbt_get_minrecs(
192 struct xfs_btree_cur *cur,
193 int level)
194{
195 return cur->bc_mp->m_alloc_mnr[level != 0];
196}
197
198STATIC int
199xfs_allocbt_get_maxrecs(
200 struct xfs_btree_cur *cur,
201 int level)
202{
203 return cur->bc_mp->m_alloc_mxr[level != 0];
204}
205
206STATIC void
207xfs_allocbt_init_key_from_rec(
208 union xfs_btree_key *key,
209 union xfs_btree_rec *rec)
210{
211 ASSERT(rec->alloc.ar_startblock != 0);
212
213 key->alloc.ar_startblock = rec->alloc.ar_startblock;
214 key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
215}
216
217STATIC void
218xfs_allocbt_init_rec_from_key(
219 union xfs_btree_key *key,
220 union xfs_btree_rec *rec)
221{
222 ASSERT(key->alloc.ar_startblock != 0);
223
224 rec->alloc.ar_startblock = key->alloc.ar_startblock;
225 rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
226}
227
228STATIC void
229xfs_allocbt_init_rec_from_cur(
230 struct xfs_btree_cur *cur,
231 union xfs_btree_rec *rec)
232{
233 ASSERT(cur->bc_rec.a.ar_startblock != 0);
234
235 rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
236 rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
237}
238
239STATIC void
240xfs_allocbt_init_ptr_from_cur(
241 struct xfs_btree_cur *cur,
242 union xfs_btree_ptr *ptr)
243{
244 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
245
246 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
247 ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
248
249 ptr->s = agf->agf_roots[cur->bc_btnum];
250}
251
252STATIC __int64_t
253xfs_allocbt_key_diff(
254 struct xfs_btree_cur *cur,
255 union xfs_btree_key *key)
256{
257 xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
258 xfs_alloc_key_t *kp = &key->alloc;
259 __int64_t diff;
260
261 if (cur->bc_btnum == XFS_BTNUM_BNO) {
262 return (__int64_t)be32_to_cpu(kp->ar_startblock) -
263 rec->ar_startblock;
264 }
265
266 diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
267 if (diff)
268 return diff;
269
270 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
271}
272
273static bool
274xfs_allocbt_verify(
275 struct xfs_buf *bp)
276{
277 struct xfs_mount *mp = bp->b_target->bt_mount;
278 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
279 struct xfs_perag *pag = bp->b_pag;
280 unsigned int level;
281
282 /*
283 * magic number and level verification
284 *
285 * During growfs operations, we can't verify the exact level or owner as
286 * the perag is not fully initialised and hence not attached to the
287 * buffer. In this case, check against the maximum tree depth.
288 *
289 * Similarly, during log recovery we will have a perag structure
290 * attached, but the agf information will not yet have been initialised
291 * from the on disk AGF. Again, we can only check against maximum limits
292 * in this case.
293 */
294 level = be16_to_cpu(block->bb_level);
295 switch (block->bb_magic) {
296 case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
297 if (!xfs_sb_version_hascrc(&mp->m_sb))
298 return false;
299 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
300 return false;
301 if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
302 return false;
303 if (pag &&
304 be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
305 return false;
306 /* fall through */
307 case cpu_to_be32(XFS_ABTB_MAGIC):
308 if (pag && pag->pagf_init) {
309 if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
310 return false;
311 } else if (level >= mp->m_ag_maxlevels)
312 return false;
313 break;
314 case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
315 if (!xfs_sb_version_hascrc(&mp->m_sb))
316 return false;
317 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
318 return false;
319 if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
320 return false;
321 if (pag &&
322 be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
323 return false;
324 /* fall through */
325 case cpu_to_be32(XFS_ABTC_MAGIC):
326 if (pag && pag->pagf_init) {
327 if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
328 return false;
329 } else if (level >= mp->m_ag_maxlevels)
330 return false;
331 break;
332 default:
333 return false;
334 }
335
336 /* numrecs verification */
337 if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
338 return false;
339
340 /* sibling pointer verification */
341 if (!block->bb_u.s.bb_leftsib ||
342 (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
343 block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
344 return false;
345 if (!block->bb_u.s.bb_rightsib ||
346 (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
347 block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
348 return false;
349
350 return true;
351}
352
353static void
354xfs_allocbt_read_verify(
355 struct xfs_buf *bp)
356{
357 if (!xfs_btree_sblock_verify_crc(bp))
358 xfs_buf_ioerror(bp, -EFSBADCRC);
359 else if (!xfs_allocbt_verify(bp))
360 xfs_buf_ioerror(bp, -EFSCORRUPTED);
361
362 if (bp->b_error) {
363 trace_xfs_btree_corrupt(bp, _RET_IP_);
364 xfs_verifier_error(bp);
365 }
366}
367
368static void
369xfs_allocbt_write_verify(
370 struct xfs_buf *bp)
371{
372 if (!xfs_allocbt_verify(bp)) {
373 trace_xfs_btree_corrupt(bp, _RET_IP_);
374 xfs_buf_ioerror(bp, -EFSCORRUPTED);
375 xfs_verifier_error(bp);
376 return;
377 }
378 xfs_btree_sblock_calc_crc(bp);
379
380}
381
382const struct xfs_buf_ops xfs_allocbt_buf_ops = {
383 .verify_read = xfs_allocbt_read_verify,
384 .verify_write = xfs_allocbt_write_verify,
385};
386
387
388#if defined(DEBUG) || defined(XFS_WARN)
389STATIC int
390xfs_allocbt_keys_inorder(
391 struct xfs_btree_cur *cur,
392 union xfs_btree_key *k1,
393 union xfs_btree_key *k2)
394{
395 if (cur->bc_btnum == XFS_BTNUM_BNO) {
396 return be32_to_cpu(k1->alloc.ar_startblock) <
397 be32_to_cpu(k2->alloc.ar_startblock);
398 } else {
399 return be32_to_cpu(k1->alloc.ar_blockcount) <
400 be32_to_cpu(k2->alloc.ar_blockcount) ||
401 (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
402 be32_to_cpu(k1->alloc.ar_startblock) <
403 be32_to_cpu(k2->alloc.ar_startblock));
404 }
405}
406
407STATIC int
408xfs_allocbt_recs_inorder(
409 struct xfs_btree_cur *cur,
410 union xfs_btree_rec *r1,
411 union xfs_btree_rec *r2)
412{
413 if (cur->bc_btnum == XFS_BTNUM_BNO) {
414 return be32_to_cpu(r1->alloc.ar_startblock) +
415 be32_to_cpu(r1->alloc.ar_blockcount) <=
416 be32_to_cpu(r2->alloc.ar_startblock);
417 } else {
418 return be32_to_cpu(r1->alloc.ar_blockcount) <
419 be32_to_cpu(r2->alloc.ar_blockcount) ||
420 (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
421 be32_to_cpu(r1->alloc.ar_startblock) <
422 be32_to_cpu(r2->alloc.ar_startblock));
423 }
424}
425#endif /* DEBUG */
426
427static const struct xfs_btree_ops xfs_allocbt_ops = {
428 .rec_len = sizeof(xfs_alloc_rec_t),
429 .key_len = sizeof(xfs_alloc_key_t),
430
431 .dup_cursor = xfs_allocbt_dup_cursor,
432 .set_root = xfs_allocbt_set_root,
433 .alloc_block = xfs_allocbt_alloc_block,
434 .free_block = xfs_allocbt_free_block,
435 .update_lastrec = xfs_allocbt_update_lastrec,
436 .get_minrecs = xfs_allocbt_get_minrecs,
437 .get_maxrecs = xfs_allocbt_get_maxrecs,
438 .init_key_from_rec = xfs_allocbt_init_key_from_rec,
439 .init_rec_from_key = xfs_allocbt_init_rec_from_key,
440 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
441 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
442 .key_diff = xfs_allocbt_key_diff,
443 .buf_ops = &xfs_allocbt_buf_ops,
444#if defined(DEBUG) || defined(XFS_WARN)
445 .keys_inorder = xfs_allocbt_keys_inorder,
446 .recs_inorder = xfs_allocbt_recs_inorder,
447#endif
448};
449
450/*
451 * Allocate a new allocation btree cursor.
452 */
453struct xfs_btree_cur * /* new alloc btree cursor */
454xfs_allocbt_init_cursor(
455 struct xfs_mount *mp, /* file system mount point */
456 struct xfs_trans *tp, /* transaction pointer */
457 struct xfs_buf *agbp, /* buffer for agf structure */
458 xfs_agnumber_t agno, /* allocation group number */
459 xfs_btnum_t btnum) /* btree identifier */
460{
461 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
462 struct xfs_btree_cur *cur;
463
464 ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
465
466 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
467
468 cur->bc_tp = tp;
469 cur->bc_mp = mp;
470 cur->bc_btnum = btnum;
471 cur->bc_blocklog = mp->m_sb.sb_blocklog;
472 cur->bc_ops = &xfs_allocbt_ops;
473
474 if (btnum == XFS_BTNUM_CNT) {
475 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
476 cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
477 } else {
478 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
479 }
480
481 cur->bc_private.a.agbp = agbp;
482 cur->bc_private.a.agno = agno;
483
484 if (xfs_sb_version_hascrc(&mp->m_sb))
485 cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
486
487 return cur;
488}
489
490/*
491 * Calculate number of records in an alloc btree block.
492 */
493int
494xfs_allocbt_maxrecs(
495 struct xfs_mount *mp,
496 int blocklen,
497 int leaf)
498{
499 blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
500
501 if (leaf)
502 return blocklen / sizeof(xfs_alloc_rec_t);
503 return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
504}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
new file mode 100644
index 000000000000..45e189e7e81c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -0,0 +1,65 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_ALLOC_BTREE_H__
19#define __XFS_ALLOC_BTREE_H__
20
21/*
22 * Freespace on-disk structures
23 */
24
25struct xfs_buf;
26struct xfs_btree_cur;
27struct xfs_mount;
28
/*
 * The btree block header length is selected by the superblock CRC feature.
 */
#define XFS_ALLOC_BLOCK_LEN(mp) \
	(!xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
		XFS_BTREE_SBLOCK_LEN : XFS_BTREE_SBLOCK_CRC_LEN)

/*
 * Address of the index'th (1-based) record, key or pointer in a btree block.
 * Records and keys follow the block header directly; pointers follow a key
 * area sized for maxrecs keys.
 *
 * (note that some of these may appear unused, but they are used in userspace)
 */
#define XFS_ALLOC_REC_ADDR(mp, block, index) \
	((xfs_alloc_rec_t *)((char *)(block) + XFS_ALLOC_BLOCK_LEN(mp) + \
			     sizeof(xfs_alloc_rec_t) * ((index) - 1)))

#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
	((xfs_alloc_key_t *)((char *)(block) + XFS_ALLOC_BLOCK_LEN(mp) + \
			     sizeof(xfs_alloc_key_t) * ((index) - 1)))

#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
	((xfs_alloc_ptr_t *)((char *)(block) + XFS_ALLOC_BLOCK_LEN(mp) + \
			     sizeof(xfs_alloc_key_t) * (maxrecs) + \
			     sizeof(xfs_alloc_ptr_t) * ((index) - 1)))
59
60extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
61 struct xfs_trans *, struct xfs_buf *,
62 xfs_agnumber_t, xfs_btnum_t);
63extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
64
65#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
new file mode 100644
index 000000000000..353fb425faef
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -0,0 +1,1459 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_format.h"
29#include "xfs_da_btree.h"
30#include "xfs_attr_sf.h"
31#include "xfs_inode.h"
32#include "xfs_alloc.h"
33#include "xfs_trans.h"
34#include "xfs_inode_item.h"
35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
37#include "xfs_bmap_btree.h"
38#include "xfs_attr.h"
39#include "xfs_attr_leaf.h"
40#include "xfs_attr_remote.h"
41#include "xfs_error.h"
42#include "xfs_quota.h"
43#include "xfs_trans_space.h"
44#include "xfs_trace.h"
45#include "xfs_dinode.h"
46
47/*
48 * xfs_attr.c
49 *
50 * Provide the external interfaces to manage attribute lists.
51 */
52
53/*========================================================================
54 * Function prototypes for the kernel.
55 *========================================================================*/
56
57/*
58 * Internal routines when attribute list fits inside the inode.
59 */
60STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
61
62/*
63 * Internal routines when attribute list is one block.
64 */
65STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
66STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
67STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
68
69/*
70 * Internal routines when attribute list is more than one block.
71 */
72STATIC int xfs_attr_node_get(xfs_da_args_t *args);
73STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
74STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
75STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
76STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
77
78
79STATIC int
80xfs_attr_args_init(
81 struct xfs_da_args *args,
82 struct xfs_inode *dp,
83 const unsigned char *name,
84 int flags)
85{
86
87 if (!name)
88 return -EINVAL;
89
90 memset(args, 0, sizeof(*args));
91 args->geo = dp->i_mount->m_attr_geo;
92 args->whichfork = XFS_ATTR_FORK;
93 args->dp = dp;
94 args->flags = flags;
95 args->name = name;
96 args->namelen = strlen((const char *)name);
97 if (args->namelen >= MAXNAMELEN)
98 return -EFAULT; /* match IRIX behaviour */
99
100 args->hashval = xfs_da_hashname(args->name, args->namelen);
101 return 0;
102}
103
104int
105xfs_inode_hasattr(
106 struct xfs_inode *ip)
107{
108 if (!XFS_IFORK_Q(ip) ||
109 (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
110 ip->i_d.di_anextents == 0))
111 return 0;
112 return 1;
113}
114
115/*========================================================================
116 * Overall external interface routines.
117 *========================================================================*/
118
119int
120xfs_attr_get(
121 struct xfs_inode *ip,
122 const unsigned char *name,
123 unsigned char *value,
124 int *valuelenp,
125 int flags)
126{
127 struct xfs_da_args args;
128 uint lock_mode;
129 int error;
130
131 XFS_STATS_INC(xs_attr_get);
132
133 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
134 return -EIO;
135
136 if (!xfs_inode_hasattr(ip))
137 return -ENOATTR;
138
139 error = xfs_attr_args_init(&args, ip, name, flags);
140 if (error)
141 return error;
142
143 args.value = value;
144 args.valuelen = *valuelenp;
145
146 lock_mode = xfs_ilock_attr_map_shared(ip);
147 if (!xfs_inode_hasattr(ip))
148 error = -ENOATTR;
149 else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
150 error = xfs_attr_shortform_getvalue(&args);
151 else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
152 error = xfs_attr_leaf_get(&args);
153 else
154 error = xfs_attr_node_get(&args);
155 xfs_iunlock(ip, lock_mode);
156
157 *valuelenp = args.valuelen;
158 return error == -EEXIST ? 0 : error;
159}
160
161/*
162 * Calculate how many blocks we need for the new attribute,
163 */
164STATIC int
165xfs_attr_calc_size(
166 struct xfs_da_args *args,
167 int *local)
168{
169 struct xfs_mount *mp = args->dp->i_mount;
170 int size;
171 int nblks;
172
173 /*
174 * Determine space new attribute will use, and if it would be
175 * "local" or "remote" (note: local != inline).
176 */
177 size = xfs_attr_leaf_newentsize(args, local);
178 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
179 if (*local) {
180 if (size > (args->geo->blksize / 2)) {
181 /* Double split possible */
182 nblks *= 2;
183 }
184 } else {
185 /*
186 * Out of line attribute, cannot double split, but
187 * make room for the attribute value itself.
188 */
189 uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
190 nblks += dblocks;
191 nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
192 }
193
194 return nblks;
195}
196
197int
198xfs_attr_set(
199 struct xfs_inode *dp,
200 const unsigned char *name,
201 unsigned char *value,
202 int valuelen,
203 int flags)
204{
205 struct xfs_mount *mp = dp->i_mount;
206 struct xfs_da_args args;
207 struct xfs_bmap_free flist;
208 struct xfs_trans_res tres;
209 xfs_fsblock_t firstblock;
210 int rsvd = (flags & ATTR_ROOT) != 0;
211 int error, err2, committed, local;
212
213 XFS_STATS_INC(xs_attr_set);
214
215 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
216 return -EIO;
217
218 error = xfs_attr_args_init(&args, dp, name, flags);
219 if (error)
220 return error;
221
222 args.value = value;
223 args.valuelen = valuelen;
224 args.firstblock = &firstblock;
225 args.flist = &flist;
226 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
227 args.total = xfs_attr_calc_size(&args, &local);
228
229 error = xfs_qm_dqattach(dp, 0);
230 if (error)
231 return error;
232
233 /*
234 * If the inode doesn't have an attribute fork, add one.
235 * (inode must not be locked when we call this routine)
236 */
237 if (XFS_IFORK_Q(dp) == 0) {
238 int sf_size = sizeof(xfs_attr_sf_hdr_t) +
239 XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
240
241 error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
242 if (error)
243 return error;
244 }
245
246 /*
247 * Start our first transaction of the day.
248 *
249 * All future transactions during this code must be "chained" off
250 * this one via the trans_dup() call. All transactions will contain
251 * the inode, and the inode will always be marked with trans_ihold().
252 * Since the inode will be locked in all transactions, we must log
253 * the inode in every transaction to let it float upward through
254 * the log.
255 */
256 args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
257
258 /*
259 * Root fork attributes can use reserved data blocks for this
260 * operation if necessary
261 */
262
263 if (rsvd)
264 args.trans->t_flags |= XFS_TRANS_RESERVE;
265
266 tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
267 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
268 tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
269 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
270 error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
271 if (error) {
272 xfs_trans_cancel(args.trans, 0);
273 return error;
274 }
275 xfs_ilock(dp, XFS_ILOCK_EXCL);
276
277 error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
278 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
279 XFS_QMOPT_RES_REGBLKS);
280 if (error) {
281 xfs_iunlock(dp, XFS_ILOCK_EXCL);
282 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
283 return error;
284 }
285
286 xfs_trans_ijoin(args.trans, dp, 0);
287
288 /*
289 * If the attribute list is non-existent or a shortform list,
290 * upgrade it to a single-leaf-block attribute list.
291 */
292 if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
293 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
294 dp->i_d.di_anextents == 0)) {
295
296 /*
297 * Build initial attribute list (if required).
298 */
299 if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
300 xfs_attr_shortform_create(&args);
301
302 /*
303 * Try to add the attr to the attribute list in
304 * the inode.
305 */
306 error = xfs_attr_shortform_addname(&args);
307 if (error != -ENOSPC) {
308 /*
309 * Commit the shortform mods, and we're done.
310 * NOTE: this is also the error path (EEXIST, etc).
311 */
312 ASSERT(args.trans != NULL);
313
314 /*
315 * If this is a synchronous mount, make sure that
316 * the transaction goes to disk before returning
317 * to the user.
318 */
319 if (mp->m_flags & XFS_MOUNT_WSYNC)
320 xfs_trans_set_sync(args.trans);
321
322 if (!error && (flags & ATTR_KERNOTIME) == 0) {
323 xfs_trans_ichgtime(args.trans, dp,
324 XFS_ICHGTIME_CHG);
325 }
326 err2 = xfs_trans_commit(args.trans,
327 XFS_TRANS_RELEASE_LOG_RES);
328 xfs_iunlock(dp, XFS_ILOCK_EXCL);
329
330 return error ? error : err2;
331 }
332
333 /*
334 * It won't fit in the shortform, transform to a leaf block.
335 * GROT: another possible req'mt for a double-split btree op.
336 */
337 xfs_bmap_init(args.flist, args.firstblock);
338 error = xfs_attr_shortform_to_leaf(&args);
339 if (!error) {
340 error = xfs_bmap_finish(&args.trans, args.flist,
341 &committed);
342 }
343 if (error) {
344 ASSERT(committed);
345 args.trans = NULL;
346 xfs_bmap_cancel(&flist);
347 goto out;
348 }
349
350 /*
351 * bmap_finish() may have committed the last trans and started
352 * a new one. We need the inode to be in all transactions.
353 */
354 if (committed)
355 xfs_trans_ijoin(args.trans, dp, 0);
356
357 /*
358 * Commit the leaf transformation. We'll need another (linked)
359 * transaction to add the new attribute to the leaf.
360 */
361
362 error = xfs_trans_roll(&args.trans, dp);
363 if (error)
364 goto out;
365
366 }
367
368 if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
369 error = xfs_attr_leaf_addname(&args);
370 else
371 error = xfs_attr_node_addname(&args);
372 if (error)
373 goto out;
374
375 /*
376 * If this is a synchronous mount, make sure that the
377 * transaction goes to disk before returning to the user.
378 */
379 if (mp->m_flags & XFS_MOUNT_WSYNC)
380 xfs_trans_set_sync(args.trans);
381
382 if ((flags & ATTR_KERNOTIME) == 0)
383 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
384
385 /*
386 * Commit the last in the sequence of transactions.
387 */
388 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
389 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
390 xfs_iunlock(dp, XFS_ILOCK_EXCL);
391
392 return error;
393
394out:
395 if (args.trans) {
396 xfs_trans_cancel(args.trans,
397 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
398 }
399 xfs_iunlock(dp, XFS_ILOCK_EXCL);
400 return error;
401}
402
403/*
404 * Generic handler routine to remove a name from an attribute list.
405 * Transitions attribute list from Btree to shortform as necessary.
406 */
407int
408xfs_attr_remove(
409 struct xfs_inode *dp,
410 const unsigned char *name,
411 int flags)
412{
413 struct xfs_mount *mp = dp->i_mount;
414 struct xfs_da_args args;
415 struct xfs_bmap_free flist;
416 xfs_fsblock_t firstblock;
417 int error;
418
419 XFS_STATS_INC(xs_attr_remove);
420
421 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
422 return -EIO;
423
424 if (!xfs_inode_hasattr(dp))
425 return -ENOATTR;
426
427 error = xfs_attr_args_init(&args, dp, name, flags);
428 if (error)
429 return error;
430
431 args.firstblock = &firstblock;
432 args.flist = &flist;
433
434 /*
435 * we have no control over the attribute names that userspace passes us
436 * to remove, so we have to allow the name lookup prior to attribute
437 * removal to fail.
438 */
439 args.op_flags = XFS_DA_OP_OKNOENT;
440
441 error = xfs_qm_dqattach(dp, 0);
442 if (error)
443 return error;
444
445 /*
446 * Start our first transaction of the day.
447 *
448 * All future transactions during this code must be "chained" off
449 * this one via the trans_dup() call. All transactions will contain
450 * the inode, and the inode will always be marked with trans_ihold().
451 * Since the inode will be locked in all transactions, we must log
452 * the inode in every transaction to let it float upward through
453 * the log.
454 */
455 args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
456
457 /*
458 * Root fork attributes can use reserved data blocks for this
459 * operation if necessary
460 */
461
462 if (flags & ATTR_ROOT)
463 args.trans->t_flags |= XFS_TRANS_RESERVE;
464
465 error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
466 XFS_ATTRRM_SPACE_RES(mp), 0);
467 if (error) {
468 xfs_trans_cancel(args.trans, 0);
469 return error;
470 }
471
472 xfs_ilock(dp, XFS_ILOCK_EXCL);
473 /*
474 * No need to make quota reservations here. We expect to release some
475 * blocks not allocate in the common case.
476 */
477 xfs_trans_ijoin(args.trans, dp, 0);
478
479 if (!xfs_inode_hasattr(dp)) {
480 error = -ENOATTR;
481 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
482 ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
483 error = xfs_attr_shortform_remove(&args);
484 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
485 error = xfs_attr_leaf_removename(&args);
486 } else {
487 error = xfs_attr_node_removename(&args);
488 }
489
490 if (error)
491 goto out;
492
493 /*
494 * If this is a synchronous mount, make sure that the
495 * transaction goes to disk before returning to the user.
496 */
497 if (mp->m_flags & XFS_MOUNT_WSYNC)
498 xfs_trans_set_sync(args.trans);
499
500 if ((flags & ATTR_KERNOTIME) == 0)
501 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
502
503 /*
504 * Commit the last in the sequence of transactions.
505 */
506 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
507 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
508 xfs_iunlock(dp, XFS_ILOCK_EXCL);
509
510 return error;
511
512out:
513 if (args.trans) {
514 xfs_trans_cancel(args.trans,
515 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
516 }
517 xfs_iunlock(dp, XFS_ILOCK_EXCL);
518 return error;
519}
520
521/*========================================================================
522 * External routines when attribute list is inside the inode
523 *========================================================================*/
524
525/*
526 * Add a name to the shortform attribute list structure
527 * This is the external routine.
528 */
529STATIC int
530xfs_attr_shortform_addname(xfs_da_args_t *args)
531{
532 int newsize, forkoff, retval;
533
534 trace_xfs_attr_sf_addname(args);
535
536 retval = xfs_attr_shortform_lookup(args);
537 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
538 return retval;
539 } else if (retval == -EEXIST) {
540 if (args->flags & ATTR_CREATE)
541 return retval;
542 retval = xfs_attr_shortform_remove(args);
543 ASSERT(retval == 0);
544 }
545
546 if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
547 args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
548 return -ENOSPC;
549
550 newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
551 newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
552
553 forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
554 if (!forkoff)
555 return -ENOSPC;
556
557 xfs_attr_shortform_add(args, forkoff);
558 return 0;
559}
560
561
562/*========================================================================
563 * External routines when attribute list is one block
564 *========================================================================*/
565
566/*
567 * Add a name to the leaf attribute list structure
568 *
569 * This leaf block cannot have a "remote" value, we only call this routine
570 * if bmap_one_block() says there is only one block (ie: no remote blks).
571 */
572STATIC int
573xfs_attr_leaf_addname(xfs_da_args_t *args)
574{
575 xfs_inode_t *dp;
576 struct xfs_buf *bp;
577 int retval, error, committed, forkoff;
578
579 trace_xfs_attr_leaf_addname(args);
580
581 /*
582 * Read the (only) block in the attribute list in.
583 */
584 dp = args->dp;
585 args->blkno = 0;
586 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
587 if (error)
588 return error;
589
590 /*
591 * Look up the given attribute in the leaf block. Figure out if
592 * the given flags produce an error or call for an atomic rename.
593 */
594 retval = xfs_attr3_leaf_lookup_int(bp, args);
595 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
596 xfs_trans_brelse(args->trans, bp);
597 return retval;
598 } else if (retval == -EEXIST) {
599 if (args->flags & ATTR_CREATE) { /* pure create op */
600 xfs_trans_brelse(args->trans, bp);
601 return retval;
602 }
603
604 trace_xfs_attr_leaf_replace(args);
605
606 /* save the attribute state for later removal*/
607 args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
608 args->blkno2 = args->blkno; /* set 2nd entry info*/
609 args->index2 = args->index;
610 args->rmtblkno2 = args->rmtblkno;
611 args->rmtblkcnt2 = args->rmtblkcnt;
612 args->rmtvaluelen2 = args->rmtvaluelen;
613
614 /*
615 * clear the remote attr state now that it is saved so that the
616 * values reflect the state of the attribute we are about to
617 * add, not the attribute we just found and will remove later.
618 */
619 args->rmtblkno = 0;
620 args->rmtblkcnt = 0;
621 args->rmtvaluelen = 0;
622 }
623
624 /*
625 * Add the attribute to the leaf block, transitioning to a Btree
626 * if required.
627 */
628 retval = xfs_attr3_leaf_add(bp, args);
629 if (retval == -ENOSPC) {
630 /*
631 * Promote the attribute list to the Btree format, then
632 * Commit that transaction so that the node_addname() call
633 * can manage its own transactions.
634 */
635 xfs_bmap_init(args->flist, args->firstblock);
636 error = xfs_attr3_leaf_to_node(args);
637 if (!error) {
638 error = xfs_bmap_finish(&args->trans, args->flist,
639 &committed);
640 }
641 if (error) {
642 ASSERT(committed);
643 args->trans = NULL;
644 xfs_bmap_cancel(args->flist);
645 return error;
646 }
647
648 /*
649 * bmap_finish() may have committed the last trans and started
650 * a new one. We need the inode to be in all transactions.
651 */
652 if (committed)
653 xfs_trans_ijoin(args->trans, dp, 0);
654
655 /*
656 * Commit the current trans (including the inode) and start
657 * a new one.
658 */
659 error = xfs_trans_roll(&args->trans, dp);
660 if (error)
661 return error;
662
663 /*
664 * Fob the whole rest of the problem off on the Btree code.
665 */
666 error = xfs_attr_node_addname(args);
667 return error;
668 }
669
670 /*
671 * Commit the transaction that added the attr name so that
672 * later routines can manage their own transactions.
673 */
674 error = xfs_trans_roll(&args->trans, dp);
675 if (error)
676 return error;
677
678 /*
679 * If there was an out-of-line value, allocate the blocks we
680 * identified for its storage and copy the value. This is done
681 * after we create the attribute so that we don't overflow the
682 * maximum size of a transaction and/or hit a deadlock.
683 */
684 if (args->rmtblkno > 0) {
685 error = xfs_attr_rmtval_set(args);
686 if (error)
687 return error;
688 }
689
690 /*
691 * If this is an atomic rename operation, we must "flip" the
692 * incomplete flags on the "new" and "old" attribute/value pairs
693 * so that one disappears and one appears atomically. Then we
694 * must remove the "old" attribute/value pair.
695 */
696 if (args->op_flags & XFS_DA_OP_RENAME) {
697 /*
698 * In a separate transaction, set the incomplete flag on the
699 * "old" attr and clear the incomplete flag on the "new" attr.
700 */
701 error = xfs_attr3_leaf_flipflags(args);
702 if (error)
703 return error;
704
705 /*
706 * Dismantle the "old" attribute/value pair by removing
707 * a "remote" value (if it exists).
708 */
709 args->index = args->index2;
710 args->blkno = args->blkno2;
711 args->rmtblkno = args->rmtblkno2;
712 args->rmtblkcnt = args->rmtblkcnt2;
713 args->rmtvaluelen = args->rmtvaluelen2;
714 if (args->rmtblkno) {
715 error = xfs_attr_rmtval_remove(args);
716 if (error)
717 return error;
718 }
719
720 /*
721 * Read in the block containing the "old" attr, then
722 * remove the "old" attr from that block (neat, huh!)
723 */
724 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
725 -1, &bp);
726 if (error)
727 return error;
728
729 xfs_attr3_leaf_remove(bp, args);
730
731 /*
732 * If the result is small enough, shrink it all into the inode.
733 */
734 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
735 xfs_bmap_init(args->flist, args->firstblock);
736 error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
737 /* bp is gone due to xfs_da_shrink_inode */
738 if (!error) {
739 error = xfs_bmap_finish(&args->trans,
740 args->flist,
741 &committed);
742 }
743 if (error) {
744 ASSERT(committed);
745 args->trans = NULL;
746 xfs_bmap_cancel(args->flist);
747 return error;
748 }
749
750 /*
751 * bmap_finish() may have committed the last trans
752 * and started a new one. We need the inode to be
753 * in all transactions.
754 */
755 if (committed)
756 xfs_trans_ijoin(args->trans, dp, 0);
757 }
758
759 /*
760 * Commit the remove and start the next trans in series.
761 */
762 error = xfs_trans_roll(&args->trans, dp);
763
764 } else if (args->rmtblkno > 0) {
765 /*
766 * Added a "remote" value, just clear the incomplete flag.
767 */
768 error = xfs_attr3_leaf_clearflag(args);
769 }
770 return error;
771}
772
773/*
774 * Remove a name from the leaf attribute list structure
775 *
776 * This leaf block cannot have a "remote" value, we only call this routine
777 * if bmap_one_block() says there is only one block (ie: no remote blks).
778 */
779STATIC int
780xfs_attr_leaf_removename(xfs_da_args_t *args)
781{
782 xfs_inode_t *dp;
783 struct xfs_buf *bp;
784 int error, committed, forkoff;
785
786 trace_xfs_attr_leaf_removename(args);
787
788 /*
789 * Remove the attribute.
790 */
791 dp = args->dp;
792 args->blkno = 0;
793 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
794 if (error)
795 return error;
796
797 error = xfs_attr3_leaf_lookup_int(bp, args);
798 if (error == -ENOATTR) {
799 xfs_trans_brelse(args->trans, bp);
800 return error;
801 }
802
803 xfs_attr3_leaf_remove(bp, args);
804
805 /*
806 * If the result is small enough, shrink it all into the inode.
807 */
808 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
809 xfs_bmap_init(args->flist, args->firstblock);
810 error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
811 /* bp is gone due to xfs_da_shrink_inode */
812 if (!error) {
813 error = xfs_bmap_finish(&args->trans, args->flist,
814 &committed);
815 }
816 if (error) {
817 ASSERT(committed);
818 args->trans = NULL;
819 xfs_bmap_cancel(args->flist);
820 return error;
821 }
822
823 /*
824 * bmap_finish() may have committed the last trans and started
825 * a new one. We need the inode to be in all transactions.
826 */
827 if (committed)
828 xfs_trans_ijoin(args->trans, dp, 0);
829 }
830 return 0;
831}
832
833/*
834 * Look up a name in a leaf attribute list structure.
835 *
836 * This leaf block cannot have a "remote" value, we only call this routine
837 * if bmap_one_block() says there is only one block (ie: no remote blks).
838 */
839STATIC int
840xfs_attr_leaf_get(xfs_da_args_t *args)
841{
842 struct xfs_buf *bp;
843 int error;
844
845 trace_xfs_attr_leaf_get(args);
846
847 args->blkno = 0;
848 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
849 if (error)
850 return error;
851
852 error = xfs_attr3_leaf_lookup_int(bp, args);
853 if (error != -EEXIST) {
854 xfs_trans_brelse(args->trans, bp);
855 return error;
856 }
857 error = xfs_attr3_leaf_getvalue(bp, args);
858 xfs_trans_brelse(args->trans, bp);
859 if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
860 error = xfs_attr_rmtval_get(args);
861 }
862 return error;
863}
864
865/*========================================================================
866 * External routines when attribute list size > geo->blksize
867 *========================================================================*/
868
869/*
870 * Add a name to a Btree-format attribute list.
871 *
872 * This will involve walking down the Btree, and may involve splitting
873 * leaf nodes and even splitting intermediate nodes up to and including
874 * the root node (a special case of an intermediate node).
875 *
876 * "Remote" attribute values confuse the issue and atomic rename operations
877 * add a whole extra layer of confusion on top of that.
878 */
879STATIC int
880xfs_attr_node_addname(xfs_da_args_t *args)
881{
882 xfs_da_state_t *state;
883 xfs_da_state_blk_t *blk;
884 xfs_inode_t *dp;
885 xfs_mount_t *mp;
886 int committed, retval, error;
887
888 trace_xfs_attr_node_addname(args);
889
890 /*
891 * Fill in bucket of arguments/results/context to carry around.
892 */
893 dp = args->dp;
894 mp = dp->i_mount;
895restart:
896 state = xfs_da_state_alloc();
897 state->args = args;
898 state->mp = mp;
899
900 /*
901 * Search to see if name already exists, and get back a pointer
902 * to where it should go.
903 */
904 error = xfs_da3_node_lookup_int(state, &retval);
905 if (error)
906 goto out;
907 blk = &state->path.blk[ state->path.active-1 ];
908 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
909 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
910 goto out;
911 } else if (retval == -EEXIST) {
912 if (args->flags & ATTR_CREATE)
913 goto out;
914
915 trace_xfs_attr_node_replace(args);
916
917 /* save the attribute state for later removal*/
918 args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
919 args->blkno2 = args->blkno; /* set 2nd entry info*/
920 args->index2 = args->index;
921 args->rmtblkno2 = args->rmtblkno;
922 args->rmtblkcnt2 = args->rmtblkcnt;
923 args->rmtvaluelen2 = args->rmtvaluelen;
924
925 /*
926 * clear the remote attr state now that it is saved so that the
927 * values reflect the state of the attribute we are about to
928 * add, not the attribute we just found and will remove later.
929 */
930 args->rmtblkno = 0;
931 args->rmtblkcnt = 0;
932 args->rmtvaluelen = 0;
933 }
934
935 retval = xfs_attr3_leaf_add(blk->bp, state->args);
936 if (retval == -ENOSPC) {
937 if (state->path.active == 1) {
938 /*
939 * Its really a single leaf node, but it had
940 * out-of-line values so it looked like it *might*
941 * have been a b-tree.
942 */
943 xfs_da_state_free(state);
944 state = NULL;
945 xfs_bmap_init(args->flist, args->firstblock);
946 error = xfs_attr3_leaf_to_node(args);
947 if (!error) {
948 error = xfs_bmap_finish(&args->trans,
949 args->flist,
950 &committed);
951 }
952 if (error) {
953 ASSERT(committed);
954 args->trans = NULL;
955 xfs_bmap_cancel(args->flist);
956 goto out;
957 }
958
959 /*
960 * bmap_finish() may have committed the last trans
961 * and started a new one. We need the inode to be
962 * in all transactions.
963 */
964 if (committed)
965 xfs_trans_ijoin(args->trans, dp, 0);
966
967 /*
968 * Commit the node conversion and start the next
969 * trans in the chain.
970 */
971 error = xfs_trans_roll(&args->trans, dp);
972 if (error)
973 goto out;
974
975 goto restart;
976 }
977
978 /*
979 * Split as many Btree elements as required.
980 * This code tracks the new and old attr's location
981 * in the index/blkno/rmtblkno/rmtblkcnt fields and
982 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
983 */
984 xfs_bmap_init(args->flist, args->firstblock);
985 error = xfs_da3_split(state);
986 if (!error) {
987 error = xfs_bmap_finish(&args->trans, args->flist,
988 &committed);
989 }
990 if (error) {
991 ASSERT(committed);
992 args->trans = NULL;
993 xfs_bmap_cancel(args->flist);
994 goto out;
995 }
996
997 /*
998 * bmap_finish() may have committed the last trans and started
999 * a new one. We need the inode to be in all transactions.
1000 */
1001 if (committed)
1002 xfs_trans_ijoin(args->trans, dp, 0);
1003 } else {
1004 /*
1005 * Addition succeeded, update Btree hashvals.
1006 */
1007 xfs_da3_fixhashpath(state, &state->path);
1008 }
1009
1010 /*
1011 * Kill the state structure, we're done with it and need to
1012 * allow the buffers to come back later.
1013 */
1014 xfs_da_state_free(state);
1015 state = NULL;
1016
1017 /*
1018 * Commit the leaf addition or btree split and start the next
1019 * trans in the chain.
1020 */
1021 error = xfs_trans_roll(&args->trans, dp);
1022 if (error)
1023 goto out;
1024
1025 /*
1026 * If there was an out-of-line value, allocate the blocks we
1027 * identified for its storage and copy the value. This is done
1028 * after we create the attribute so that we don't overflow the
1029 * maximum size of a transaction and/or hit a deadlock.
1030 */
1031 if (args->rmtblkno > 0) {
1032 error = xfs_attr_rmtval_set(args);
1033 if (error)
1034 return error;
1035 }
1036
1037 /*
1038 * If this is an atomic rename operation, we must "flip" the
1039 * incomplete flags on the "new" and "old" attribute/value pairs
1040 * so that one disappears and one appears atomically. Then we
1041 * must remove the "old" attribute/value pair.
1042 */
1043 if (args->op_flags & XFS_DA_OP_RENAME) {
1044 /*
1045 * In a separate transaction, set the incomplete flag on the
1046 * "old" attr and clear the incomplete flag on the "new" attr.
1047 */
1048 error = xfs_attr3_leaf_flipflags(args);
1049 if (error)
1050 goto out;
1051
1052 /*
1053 * Dismantle the "old" attribute/value pair by removing
1054 * a "remote" value (if it exists).
1055 */
1056 args->index = args->index2;
1057 args->blkno = args->blkno2;
1058 args->rmtblkno = args->rmtblkno2;
1059 args->rmtblkcnt = args->rmtblkcnt2;
1060 args->rmtvaluelen = args->rmtvaluelen2;
1061 if (args->rmtblkno) {
1062 error = xfs_attr_rmtval_remove(args);
1063 if (error)
1064 return error;
1065 }
1066
1067 /*
1068 * Re-find the "old" attribute entry after any split ops.
1069 * The INCOMPLETE flag means that we will find the "old"
1070 * attr, not the "new" one.
1071 */
1072 args->flags |= XFS_ATTR_INCOMPLETE;
1073 state = xfs_da_state_alloc();
1074 state->args = args;
1075 state->mp = mp;
1076 state->inleaf = 0;
1077 error = xfs_da3_node_lookup_int(state, &retval);
1078 if (error)
1079 goto out;
1080
1081 /*
1082 * Remove the name and update the hashvals in the tree.
1083 */
1084 blk = &state->path.blk[ state->path.active-1 ];
1085 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1086 error = xfs_attr3_leaf_remove(blk->bp, args);
1087 xfs_da3_fixhashpath(state, &state->path);
1088
1089 /*
1090 * Check to see if the tree needs to be collapsed.
1091 */
1092 if (retval && (state->path.active > 1)) {
1093 xfs_bmap_init(args->flist, args->firstblock);
1094 error = xfs_da3_join(state);
1095 if (!error) {
1096 error = xfs_bmap_finish(&args->trans,
1097 args->flist,
1098 &committed);
1099 }
1100 if (error) {
1101 ASSERT(committed);
1102 args->trans = NULL;
1103 xfs_bmap_cancel(args->flist);
1104 goto out;
1105 }
1106
1107 /*
1108 * bmap_finish() may have committed the last trans
1109 * and started a new one. We need the inode to be
1110 * in all transactions.
1111 */
1112 if (committed)
1113 xfs_trans_ijoin(args->trans, dp, 0);
1114 }
1115
1116 /*
1117 * Commit and start the next trans in the chain.
1118 */
1119 error = xfs_trans_roll(&args->trans, dp);
1120 if (error)
1121 goto out;
1122
1123 } else if (args->rmtblkno > 0) {
1124 /*
1125 * Added a "remote" value, just clear the incomplete flag.
1126 */
1127 error = xfs_attr3_leaf_clearflag(args);
1128 if (error)
1129 goto out;
1130 }
1131 retval = error = 0;
1132
1133out:
1134 if (state)
1135 xfs_da_state_free(state);
1136 if (error)
1137 return error;
1138 return retval;
1139}
1140
1141/*
1142 * Remove a name from a B-tree attribute list.
1143 *
1144 * This will involve walking down the Btree, and may involve joining
1145 * leaf nodes and even joining intermediate nodes up to and including
1146 * the root node (a special case of an intermediate node).
1147 */
1148STATIC int
1149xfs_attr_node_removename(xfs_da_args_t *args)
1150{
1151 xfs_da_state_t *state;
1152 xfs_da_state_blk_t *blk;
1153 xfs_inode_t *dp;
1154 struct xfs_buf *bp;
1155 int retval, error, committed, forkoff;
1156
1157 trace_xfs_attr_node_removename(args);
1158
1159 /*
1160 * Tie a string around our finger to remind us where we are.
1161 */
1162 dp = args->dp;
1163 state = xfs_da_state_alloc();
1164 state->args = args;
1165 state->mp = dp->i_mount;
1166
1167 /*
1168 * Search to see if name exists, and get back a pointer to it.
1169 */
1170 error = xfs_da3_node_lookup_int(state, &retval);
1171 if (error || (retval != -EEXIST)) {
1172 if (error == 0)
1173 error = retval;
1174 goto out;
1175 }
1176
1177 /*
1178 * If there is an out-of-line value, de-allocate the blocks.
1179 * This is done before we remove the attribute so that we don't
1180 * overflow the maximum size of a transaction and/or hit a deadlock.
1181 */
1182 blk = &state->path.blk[ state->path.active-1 ];
1183 ASSERT(blk->bp != NULL);
1184 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1185 if (args->rmtblkno > 0) {
1186 /*
1187 * Fill in disk block numbers in the state structure
1188 * so that we can get the buffers back after we commit
1189 * several transactions in the following calls.
1190 */
1191 error = xfs_attr_fillstate(state);
1192 if (error)
1193 goto out;
1194
1195 /*
1196 * Mark the attribute as INCOMPLETE, then bunmapi() the
1197 * remote value.
1198 */
1199 error = xfs_attr3_leaf_setflag(args);
1200 if (error)
1201 goto out;
1202 error = xfs_attr_rmtval_remove(args);
1203 if (error)
1204 goto out;
1205
1206 /*
1207 * Refill the state structure with buffers, the prior calls
1208 * released our buffers.
1209 */
1210 error = xfs_attr_refillstate(state);
1211 if (error)
1212 goto out;
1213 }
1214
1215 /*
1216 * Remove the name and update the hashvals in the tree.
1217 */
1218 blk = &state->path.blk[ state->path.active-1 ];
1219 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1220 retval = xfs_attr3_leaf_remove(blk->bp, args);
1221 xfs_da3_fixhashpath(state, &state->path);
1222
1223 /*
1224 * Check to see if the tree needs to be collapsed.
1225 */
1226 if (retval && (state->path.active > 1)) {
1227 xfs_bmap_init(args->flist, args->firstblock);
1228 error = xfs_da3_join(state);
1229 if (!error) {
1230 error = xfs_bmap_finish(&args->trans, args->flist,
1231 &committed);
1232 }
1233 if (error) {
1234 ASSERT(committed);
1235 args->trans = NULL;
1236 xfs_bmap_cancel(args->flist);
1237 goto out;
1238 }
1239
1240 /*
1241 * bmap_finish() may have committed the last trans and started
1242 * a new one. We need the inode to be in all transactions.
1243 */
1244 if (committed)
1245 xfs_trans_ijoin(args->trans, dp, 0);
1246
1247 /*
1248 * Commit the Btree join operation and start a new trans.
1249 */
1250 error = xfs_trans_roll(&args->trans, dp);
1251 if (error)
1252 goto out;
1253 }
1254
1255 /*
1256 * If the result is small enough, push it all into the inode.
1257 */
1258 if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
1259 /*
1260 * Have to get rid of the copy of this dabuf in the state.
1261 */
1262 ASSERT(state->path.active == 1);
1263 ASSERT(state->path.blk[0].bp);
1264 state->path.blk[0].bp = NULL;
1265
1266 error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
1267 if (error)
1268 goto out;
1269
1270 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1271 xfs_bmap_init(args->flist, args->firstblock);
1272 error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
1273 /* bp is gone due to xfs_da_shrink_inode */
1274 if (!error) {
1275 error = xfs_bmap_finish(&args->trans,
1276 args->flist,
1277 &committed);
1278 }
1279 if (error) {
1280 ASSERT(committed);
1281 args->trans = NULL;
1282 xfs_bmap_cancel(args->flist);
1283 goto out;
1284 }
1285
1286 /*
1287 * bmap_finish() may have committed the last trans
1288 * and started a new one. We need the inode to be
1289 * in all transactions.
1290 */
1291 if (committed)
1292 xfs_trans_ijoin(args->trans, dp, 0);
1293 } else
1294 xfs_trans_brelse(args->trans, bp);
1295 }
1296 error = 0;
1297
1298out:
1299 xfs_da_state_free(state);
1300 return error;
1301}
1302
1303/*
1304 * Fill in the disk block numbers in the state structure for the buffers
1305 * that are attached to the state structure.
1306 * This is done so that we can quickly reattach ourselves to those buffers
1307 * after some set of transaction commits have released these buffers.
1308 */
1309STATIC int
1310xfs_attr_fillstate(xfs_da_state_t *state)
1311{
1312 xfs_da_state_path_t *path;
1313 xfs_da_state_blk_t *blk;
1314 int level;
1315
1316 trace_xfs_attr_fillstate(state->args);
1317
1318 /*
1319 * Roll down the "path" in the state structure, storing the on-disk
1320 * block number for those buffers in the "path".
1321 */
1322 path = &state->path;
1323 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1324 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1325 if (blk->bp) {
1326 blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
1327 blk->bp = NULL;
1328 } else {
1329 blk->disk_blkno = 0;
1330 }
1331 }
1332
1333 /*
1334 * Roll down the "altpath" in the state structure, storing the on-disk
1335 * block number for those buffers in the "altpath".
1336 */
1337 path = &state->altpath;
1338 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1339 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1340 if (blk->bp) {
1341 blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
1342 blk->bp = NULL;
1343 } else {
1344 blk->disk_blkno = 0;
1345 }
1346 }
1347
1348 return 0;
1349}
1350
1351/*
1352 * Reattach the buffers to the state structure based on the disk block
1353 * numbers stored in the state structure.
1354 * This is done after some set of transaction commits have released those
1355 * buffers from our grip.
1356 */
1357STATIC int
1358xfs_attr_refillstate(xfs_da_state_t *state)
1359{
1360 xfs_da_state_path_t *path;
1361 xfs_da_state_blk_t *blk;
1362 int level, error;
1363
1364 trace_xfs_attr_refillstate(state->args);
1365
1366 /*
1367 * Roll down the "path" in the state structure, storing the on-disk
1368 * block number for those buffers in the "path".
1369 */
1370 path = &state->path;
1371 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1372 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1373 if (blk->disk_blkno) {
1374 error = xfs_da3_node_read(state->args->trans,
1375 state->args->dp,
1376 blk->blkno, blk->disk_blkno,
1377 &blk->bp, XFS_ATTR_FORK);
1378 if (error)
1379 return error;
1380 } else {
1381 blk->bp = NULL;
1382 }
1383 }
1384
1385 /*
1386 * Roll down the "altpath" in the state structure, storing the on-disk
1387 * block number for those buffers in the "altpath".
1388 */
1389 path = &state->altpath;
1390 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1391 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1392 if (blk->disk_blkno) {
1393 error = xfs_da3_node_read(state->args->trans,
1394 state->args->dp,
1395 blk->blkno, blk->disk_blkno,
1396 &blk->bp, XFS_ATTR_FORK);
1397 if (error)
1398 return error;
1399 } else {
1400 blk->bp = NULL;
1401 }
1402 }
1403
1404 return 0;
1405}
1406
1407/*
1408 * Look up a filename in a node attribute list.
1409 *
1410 * This routine gets called for any attribute fork that has more than one
1411 * block, ie: both true Btree attr lists and for single-leaf-blocks with
1412 * "remote" values taking up more blocks.
1413 */
1414STATIC int
1415xfs_attr_node_get(xfs_da_args_t *args)
1416{
1417 xfs_da_state_t *state;
1418 xfs_da_state_blk_t *blk;
1419 int error, retval;
1420 int i;
1421
1422 trace_xfs_attr_node_get(args);
1423
1424 state = xfs_da_state_alloc();
1425 state->args = args;
1426 state->mp = args->dp->i_mount;
1427
1428 /*
1429 * Search to see if name exists, and get back a pointer to it.
1430 */
1431 error = xfs_da3_node_lookup_int(state, &retval);
1432 if (error) {
1433 retval = error;
1434 } else if (retval == -EEXIST) {
1435 blk = &state->path.blk[ state->path.active-1 ];
1436 ASSERT(blk->bp != NULL);
1437 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1438
1439 /*
1440 * Get the value, local or "remote"
1441 */
1442 retval = xfs_attr3_leaf_getvalue(blk->bp, args);
1443 if (!retval && (args->rmtblkno > 0)
1444 && !(args->flags & ATTR_KERNOVAL)) {
1445 retval = xfs_attr_rmtval_get(args);
1446 }
1447 }
1448
1449 /*
1450 * If not in a transaction, we have to release all the buffers.
1451 */
1452 for (i = 0; i < state->path.active; i++) {
1453 xfs_trans_brelse(args->trans, state->path.blk[i].bp);
1454 state->path.blk[i].bp = NULL;
1455 }
1456
1457 xfs_da_state_free(state);
1458 return retval;
1459}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
new file mode 100644
index 000000000000..b1f73dbbf3d8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -0,0 +1,2697 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_shared.h"
22#include "xfs_format.h"
23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h"
25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_format.h"
30#include "xfs_da_btree.h"
31#include "xfs_inode.h"
32#include "xfs_trans.h"
33#include "xfs_inode_item.h"
34#include "xfs_bmap_btree.h"
35#include "xfs_bmap.h"
36#include "xfs_attr_sf.h"
37#include "xfs_attr_remote.h"
38#include "xfs_attr.h"
39#include "xfs_attr_leaf.h"
40#include "xfs_error.h"
41#include "xfs_trace.h"
42#include "xfs_buf_item.h"
43#include "xfs_cksum.h"
44#include "xfs_dinode.h"
45#include "xfs_dir2.h"
46
47
48/*
49 * xfs_attr_leaf.c
50 *
51 * Routines to implement leaf blocks of attributes as Btrees of hashed names.
52 */
53
54/*========================================================================
55 * Function prototypes for the kernel.
56 *========================================================================*/
57
58/*
59 * Routines used for growing the Btree.
60 */
61STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
62 xfs_dablk_t which_block, struct xfs_buf **bpp);
63STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
64 struct xfs_attr3_icleaf_hdr *ichdr,
65 struct xfs_da_args *args, int freemap_index);
66STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
67 struct xfs_attr3_icleaf_hdr *ichdr,
68 struct xfs_buf *leaf_buffer);
69STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
70 xfs_da_state_blk_t *blk1,
71 xfs_da_state_blk_t *blk2);
72STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
73 xfs_da_state_blk_t *leaf_blk_1,
74 struct xfs_attr3_icleaf_hdr *ichdr1,
75 xfs_da_state_blk_t *leaf_blk_2,
76 struct xfs_attr3_icleaf_hdr *ichdr2,
77 int *number_entries_in_blk1,
78 int *number_usedbytes_in_blk1);
79
80/*
81 * Utility routines.
82 */
83STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
84 struct xfs_attr_leafblock *src_leaf,
85 struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
86 struct xfs_attr_leafblock *dst_leaf,
87 struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
88 int move_count);
89STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
90
91void
92xfs_attr3_leaf_hdr_from_disk(
93 struct xfs_attr3_icleaf_hdr *to,
94 struct xfs_attr_leafblock *from)
95{
96 int i;
97
98 ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
99 from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
100
101 if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
102 struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
103
104 to->forw = be32_to_cpu(hdr3->info.hdr.forw);
105 to->back = be32_to_cpu(hdr3->info.hdr.back);
106 to->magic = be16_to_cpu(hdr3->info.hdr.magic);
107 to->count = be16_to_cpu(hdr3->count);
108 to->usedbytes = be16_to_cpu(hdr3->usedbytes);
109 to->firstused = be16_to_cpu(hdr3->firstused);
110 to->holes = hdr3->holes;
111
112 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
113 to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
114 to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
115 }
116 return;
117 }
118 to->forw = be32_to_cpu(from->hdr.info.forw);
119 to->back = be32_to_cpu(from->hdr.info.back);
120 to->magic = be16_to_cpu(from->hdr.info.magic);
121 to->count = be16_to_cpu(from->hdr.count);
122 to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
123 to->firstused = be16_to_cpu(from->hdr.firstused);
124 to->holes = from->hdr.holes;
125
126 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
127 to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
128 to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
129 }
130}
131
132void
133xfs_attr3_leaf_hdr_to_disk(
134 struct xfs_attr_leafblock *to,
135 struct xfs_attr3_icleaf_hdr *from)
136{
137 int i;
138
139 ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
140 from->magic == XFS_ATTR3_LEAF_MAGIC);
141
142 if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
143 struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
144
145 hdr3->info.hdr.forw = cpu_to_be32(from->forw);
146 hdr3->info.hdr.back = cpu_to_be32(from->back);
147 hdr3->info.hdr.magic = cpu_to_be16(from->magic);
148 hdr3->count = cpu_to_be16(from->count);
149 hdr3->usedbytes = cpu_to_be16(from->usedbytes);
150 hdr3->firstused = cpu_to_be16(from->firstused);
151 hdr3->holes = from->holes;
152 hdr3->pad1 = 0;
153
154 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
155 hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
156 hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
157 }
158 return;
159 }
160 to->hdr.info.forw = cpu_to_be32(from->forw);
161 to->hdr.info.back = cpu_to_be32(from->back);
162 to->hdr.info.magic = cpu_to_be16(from->magic);
163 to->hdr.count = cpu_to_be16(from->count);
164 to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
165 to->hdr.firstused = cpu_to_be16(from->firstused);
166 to->hdr.holes = from->holes;
167 to->hdr.pad1 = 0;
168
169 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
170 to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
171 to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
172 }
173}
174
175static bool
176xfs_attr3_leaf_verify(
177 struct xfs_buf *bp)
178{
179 struct xfs_mount *mp = bp->b_target->bt_mount;
180 struct xfs_attr_leafblock *leaf = bp->b_addr;
181 struct xfs_attr3_icleaf_hdr ichdr;
182
183 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
184
185 if (xfs_sb_version_hascrc(&mp->m_sb)) {
186 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
187
188 if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
189 return false;
190
191 if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
192 return false;
193 if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
194 return false;
195 } else {
196 if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
197 return false;
198 }
199 if (ichdr.count == 0)
200 return false;
201
202 /* XXX: need to range check rest of attr header values */
203 /* XXX: hash order check? */
204
205 return true;
206}
207
208static void
209xfs_attr3_leaf_write_verify(
210 struct xfs_buf *bp)
211{
212 struct xfs_mount *mp = bp->b_target->bt_mount;
213 struct xfs_buf_log_item *bip = bp->b_fspriv;
214 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
215
216 if (!xfs_attr3_leaf_verify(bp)) {
217 xfs_buf_ioerror(bp, -EFSCORRUPTED);
218 xfs_verifier_error(bp);
219 return;
220 }
221
222 if (!xfs_sb_version_hascrc(&mp->m_sb))
223 return;
224
225 if (bip)
226 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
227
228 xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
229}
230
231/*
232 * leaf/node format detection on trees is sketchy, so a node read can be done on
233 * leaf level blocks when detection identifies the tree as a node format tree
234 * incorrectly. In this case, we need to swap the verifier to match the correct
235 * format of the block being read.
236 */
237static void
238xfs_attr3_leaf_read_verify(
239 struct xfs_buf *bp)
240{
241 struct xfs_mount *mp = bp->b_target->bt_mount;
242
243 if (xfs_sb_version_hascrc(&mp->m_sb) &&
244 !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
245 xfs_buf_ioerror(bp, -EFSBADCRC);
246 else if (!xfs_attr3_leaf_verify(bp))
247 xfs_buf_ioerror(bp, -EFSCORRUPTED);
248
249 if (bp->b_error)
250 xfs_verifier_error(bp);
251}
252
253const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
254 .verify_read = xfs_attr3_leaf_read_verify,
255 .verify_write = xfs_attr3_leaf_write_verify,
256};
257
258int
259xfs_attr3_leaf_read(
260 struct xfs_trans *tp,
261 struct xfs_inode *dp,
262 xfs_dablk_t bno,
263 xfs_daddr_t mappedbno,
264 struct xfs_buf **bpp)
265{
266 int err;
267
268 err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
269 XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
270 if (!err && tp)
271 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
272 return err;
273}
274
275/*========================================================================
276 * Namespace helper routines
277 *========================================================================*/
278
279/*
280 * If namespace bits don't match return 0.
281 * If all match then return 1.
282 */
283STATIC int
284xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
285{
286 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
287}
288
289
290/*========================================================================
291 * External routines when attribute fork size < XFS_LITINO(mp).
292 *========================================================================*/
293
294/*
295 * Query whether the requested number of additional bytes of extended
296 * attribute space will be able to fit inline.
297 *
298 * Returns zero if not, else the di_forkoff fork offset to be used in the
299 * literal area for attribute data once the new bytes have been added.
300 *
301 * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
302 * special case for dev/uuid inodes, they have fixed size data forks.
303 */
304int
305xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
306{
307 int offset;
308 int minforkoff; /* lower limit on valid forkoff locations */
309 int maxforkoff; /* upper limit on valid forkoff locations */
310 int dsize;
311 xfs_mount_t *mp = dp->i_mount;
312
313 /* rounded down */
314 offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
315
316 switch (dp->i_d.di_format) {
317 case XFS_DINODE_FMT_DEV:
318 minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
319 return (offset >= minforkoff) ? minforkoff : 0;
320 case XFS_DINODE_FMT_UUID:
321 minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
322 return (offset >= minforkoff) ? minforkoff : 0;
323 }
324
325 /*
326 * If the requested numbers of bytes is smaller or equal to the
327 * current attribute fork size we can always proceed.
328 *
329 * Note that if_bytes in the data fork might actually be larger than
330 * the current data fork size is due to delalloc extents. In that
331 * case either the extent count will go down when they are converted
332 * to real extents, or the delalloc conversion will take care of the
333 * literal area rebalancing.
334 */
335 if (bytes <= XFS_IFORK_ASIZE(dp))
336 return dp->i_d.di_forkoff;
337
338 /*
339 * For attr2 we can try to move the forkoff if there is space in the
340 * literal area, but for the old format we are done if there is no
341 * space in the fixed attribute fork.
342 */
343 if (!(mp->m_flags & XFS_MOUNT_ATTR2))
344 return 0;
345
346 dsize = dp->i_df.if_bytes;
347
348 switch (dp->i_d.di_format) {
349 case XFS_DINODE_FMT_EXTENTS:
350 /*
351 * If there is no attr fork and the data fork is extents,
352 * determine if creating the default attr fork will result
353 * in the extents form migrating to btree. If so, the
354 * minimum offset only needs to be the space required for
355 * the btree root.
356 */
357 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
358 xfs_default_attroffset(dp))
359 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
360 break;
361 case XFS_DINODE_FMT_BTREE:
362 /*
363 * If we have a data btree then keep forkoff if we have one,
364 * otherwise we are adding a new attr, so then we set
365 * minforkoff to where the btree root can finish so we have
366 * plenty of room for attrs
367 */
368 if (dp->i_d.di_forkoff) {
369 if (offset < dp->i_d.di_forkoff)
370 return 0;
371 return dp->i_d.di_forkoff;
372 }
373 dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
374 break;
375 }
376
377 /*
378 * A data fork btree root must have space for at least
379 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
380 */
381 minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
382 minforkoff = roundup(minforkoff, 8) >> 3;
383
384 /* attr fork btree root can have at least this many key/ptr pairs */
385 maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
386 XFS_BMDR_SPACE_CALC(MINABTPTRS);
387 maxforkoff = maxforkoff >> 3; /* rounded down */
388
389 if (offset >= maxforkoff)
390 return maxforkoff;
391 if (offset >= minforkoff)
392 return offset;
393 return 0;
394}
395
396/*
397 * Switch on the ATTR2 superblock bit (implies also FEATURES2)
398 */
399STATIC void
400xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
401{
402 if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
403 !(xfs_sb_version_hasattr2(&mp->m_sb))) {
404 spin_lock(&mp->m_sb_lock);
405 if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
406 xfs_sb_version_addattr2(&mp->m_sb);
407 spin_unlock(&mp->m_sb_lock);
408 xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
409 } else
410 spin_unlock(&mp->m_sb_lock);
411 }
412}
413
414/*
415 * Create the initial contents of a shortform attribute list.
416 */
417void
418xfs_attr_shortform_create(xfs_da_args_t *args)
419{
420 xfs_attr_sf_hdr_t *hdr;
421 xfs_inode_t *dp;
422 xfs_ifork_t *ifp;
423
424 trace_xfs_attr_sf_create(args);
425
426 dp = args->dp;
427 ASSERT(dp != NULL);
428 ifp = dp->i_afp;
429 ASSERT(ifp != NULL);
430 ASSERT(ifp->if_bytes == 0);
431 if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
432 ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */
433 dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
434 ifp->if_flags |= XFS_IFINLINE;
435 } else {
436 ASSERT(ifp->if_flags & XFS_IFINLINE);
437 }
438 xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
439 hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
440 hdr->count = 0;
441 hdr->totsize = cpu_to_be16(sizeof(*hdr));
442 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
443}
444
445/*
446 * Add a name/value pair to the shortform attribute list.
447 * Overflow from the inode has already been checked for.
448 */
449void
450xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
451{
452 xfs_attr_shortform_t *sf;
453 xfs_attr_sf_entry_t *sfe;
454 int i, offset, size;
455 xfs_mount_t *mp;
456 xfs_inode_t *dp;
457 xfs_ifork_t *ifp;
458
459 trace_xfs_attr_sf_add(args);
460
461 dp = args->dp;
462 mp = dp->i_mount;
463 dp->i_d.di_forkoff = forkoff;
464
465 ifp = dp->i_afp;
466 ASSERT(ifp->if_flags & XFS_IFINLINE);
467 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
468 sfe = &sf->list[0];
469 for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
470#ifdef DEBUG
471 if (sfe->namelen != args->namelen)
472 continue;
473 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
474 continue;
475 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
476 continue;
477 ASSERT(0);
478#endif
479 }
480
481 offset = (char *)sfe - (char *)sf;
482 size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
483 xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
484 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
485 sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
486
487 sfe->namelen = args->namelen;
488 sfe->valuelen = args->valuelen;
489 sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
490 memcpy(sfe->nameval, args->name, args->namelen);
491 memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
492 sf->hdr.count++;
493 be16_add_cpu(&sf->hdr.totsize, size);
494 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
495
496 xfs_sbversion_add_attr2(mp, args->trans);
497}
498
499/*
500 * After the last attribute is removed revert to original inode format,
501 * making all literal area available to the data fork once more.
502 */
503STATIC void
504xfs_attr_fork_reset(
505 struct xfs_inode *ip,
506 struct xfs_trans *tp)
507{
508 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
509 ip->i_d.di_forkoff = 0;
510 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
511
512 ASSERT(ip->i_d.di_anextents == 0);
513 ASSERT(ip->i_afp == NULL);
514
515 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
516}
517
518/*
519 * Remove an attribute from the shortform attribute list structure.
520 */
521int
522xfs_attr_shortform_remove(xfs_da_args_t *args)
523{
524 xfs_attr_shortform_t *sf;
525 xfs_attr_sf_entry_t *sfe;
526 int base, size=0, end, totsize, i;
527 xfs_mount_t *mp;
528 xfs_inode_t *dp;
529
530 trace_xfs_attr_sf_remove(args);
531
532 dp = args->dp;
533 mp = dp->i_mount;
534 base = sizeof(xfs_attr_sf_hdr_t);
535 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
536 sfe = &sf->list[0];
537 end = sf->hdr.count;
538 for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
539 base += size, i++) {
540 size = XFS_ATTR_SF_ENTSIZE(sfe);
541 if (sfe->namelen != args->namelen)
542 continue;
543 if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
544 continue;
545 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
546 continue;
547 break;
548 }
549 if (i == end)
550 return -ENOATTR;
551
552 /*
553 * Fix up the attribute fork data, covering the hole
554 */
555 end = base + size;
556 totsize = be16_to_cpu(sf->hdr.totsize);
557 if (end != totsize)
558 memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
559 sf->hdr.count--;
560 be16_add_cpu(&sf->hdr.totsize, -size);
561
562 /*
563 * Fix up the start offset of the attribute fork
564 */
565 totsize -= size;
566 if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
567 (mp->m_flags & XFS_MOUNT_ATTR2) &&
568 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
569 !(args->op_flags & XFS_DA_OP_ADDNAME)) {
570 xfs_attr_fork_reset(dp, args->trans);
571 } else {
572 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
573 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
574 ASSERT(dp->i_d.di_forkoff);
575 ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
576 (args->op_flags & XFS_DA_OP_ADDNAME) ||
577 !(mp->m_flags & XFS_MOUNT_ATTR2) ||
578 dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
579 xfs_trans_log_inode(args->trans, dp,
580 XFS_ILOG_CORE | XFS_ILOG_ADATA);
581 }
582
583 xfs_sbversion_add_attr2(mp, args->trans);
584
585 return 0;
586}
587
588/*
589 * Look up a name in a shortform attribute list structure.
590 */
591/*ARGSUSED*/
592int
593xfs_attr_shortform_lookup(xfs_da_args_t *args)
594{
595 xfs_attr_shortform_t *sf;
596 xfs_attr_sf_entry_t *sfe;
597 int i;
598 xfs_ifork_t *ifp;
599
600 trace_xfs_attr_sf_lookup(args);
601
602 ifp = args->dp->i_afp;
603 ASSERT(ifp->if_flags & XFS_IFINLINE);
604 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
605 sfe = &sf->list[0];
606 for (i = 0; i < sf->hdr.count;
607 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
608 if (sfe->namelen != args->namelen)
609 continue;
610 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
611 continue;
612 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
613 continue;
614 return -EEXIST;
615 }
616 return -ENOATTR;
617}
618
619/*
620 * Look up a name in a shortform attribute list structure.
621 */
622/*ARGSUSED*/
623int
624xfs_attr_shortform_getvalue(xfs_da_args_t *args)
625{
626 xfs_attr_shortform_t *sf;
627 xfs_attr_sf_entry_t *sfe;
628 int i;
629
630 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
631 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
632 sfe = &sf->list[0];
633 for (i = 0; i < sf->hdr.count;
634 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
635 if (sfe->namelen != args->namelen)
636 continue;
637 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
638 continue;
639 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
640 continue;
641 if (args->flags & ATTR_KERNOVAL) {
642 args->valuelen = sfe->valuelen;
643 return -EEXIST;
644 }
645 if (args->valuelen < sfe->valuelen) {
646 args->valuelen = sfe->valuelen;
647 return -ERANGE;
648 }
649 args->valuelen = sfe->valuelen;
650 memcpy(args->value, &sfe->nameval[args->namelen],
651 args->valuelen);
652 return -EEXIST;
653 }
654 return -ENOATTR;
655}
656
657/*
658 * Convert from using the shortform to the leaf.
659 */
660int
661xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
662{
663 xfs_inode_t *dp;
664 xfs_attr_shortform_t *sf;
665 xfs_attr_sf_entry_t *sfe;
666 xfs_da_args_t nargs;
667 char *tmpbuffer;
668 int error, i, size;
669 xfs_dablk_t blkno;
670 struct xfs_buf *bp;
671 xfs_ifork_t *ifp;
672
673 trace_xfs_attr_sf_to_leaf(args);
674
675 dp = args->dp;
676 ifp = dp->i_afp;
677 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
678 size = be16_to_cpu(sf->hdr.totsize);
679 tmpbuffer = kmem_alloc(size, KM_SLEEP);
680 ASSERT(tmpbuffer != NULL);
681 memcpy(tmpbuffer, ifp->if_u1.if_data, size);
682 sf = (xfs_attr_shortform_t *)tmpbuffer;
683
684 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
685 xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
686
687 bp = NULL;
688 error = xfs_da_grow_inode(args, &blkno);
689 if (error) {
690 /*
691 * If we hit an IO error middle of the transaction inside
692 * grow_inode(), we may have inconsistent data. Bail out.
693 */
694 if (error == -EIO)
695 goto out;
696 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
697 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
698 goto out;
699 }
700
701 ASSERT(blkno == 0);
702 error = xfs_attr3_leaf_create(args, blkno, &bp);
703 if (error) {
704 error = xfs_da_shrink_inode(args, 0, bp);
705 bp = NULL;
706 if (error)
707 goto out;
708 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
709 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
710 goto out;
711 }
712
713 memset((char *)&nargs, 0, sizeof(nargs));
714 nargs.dp = dp;
715 nargs.geo = args->geo;
716 nargs.firstblock = args->firstblock;
717 nargs.flist = args->flist;
718 nargs.total = args->total;
719 nargs.whichfork = XFS_ATTR_FORK;
720 nargs.trans = args->trans;
721 nargs.op_flags = XFS_DA_OP_OKNOENT;
722
723 sfe = &sf->list[0];
724 for (i = 0; i < sf->hdr.count; i++) {
725 nargs.name = sfe->nameval;
726 nargs.namelen = sfe->namelen;
727 nargs.value = &sfe->nameval[nargs.namelen];
728 nargs.valuelen = sfe->valuelen;
729 nargs.hashval = xfs_da_hashname(sfe->nameval,
730 sfe->namelen);
731 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
732 error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
733 ASSERT(error == -ENOATTR);
734 error = xfs_attr3_leaf_add(bp, &nargs);
735 ASSERT(error != -ENOSPC);
736 if (error)
737 goto out;
738 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
739 }
740 error = 0;
741
742out:
743 kmem_free(tmpbuffer);
744 return error;
745}
746
747/*
748 * Check a leaf attribute block to see if all the entries would fit into
749 * a shortform attribute list.
750 */
751int
752xfs_attr_shortform_allfit(
753 struct xfs_buf *bp,
754 struct xfs_inode *dp)
755{
756 struct xfs_attr_leafblock *leaf;
757 struct xfs_attr_leaf_entry *entry;
758 xfs_attr_leaf_name_local_t *name_loc;
759 struct xfs_attr3_icleaf_hdr leafhdr;
760 int bytes;
761 int i;
762
763 leaf = bp->b_addr;
764 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
765 entry = xfs_attr3_leaf_entryp(leaf);
766
767 bytes = sizeof(struct xfs_attr_sf_hdr);
768 for (i = 0; i < leafhdr.count; entry++, i++) {
769 if (entry->flags & XFS_ATTR_INCOMPLETE)
770 continue; /* don't copy partial entries */
771 if (!(entry->flags & XFS_ATTR_LOCAL))
772 return 0;
773 name_loc = xfs_attr3_leaf_name_local(leaf, i);
774 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
775 return 0;
776 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
777 return 0;
778 bytes += sizeof(struct xfs_attr_sf_entry) - 1
779 + name_loc->namelen
780 + be16_to_cpu(name_loc->valuelen);
781 }
782 if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
783 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
784 (bytes == sizeof(struct xfs_attr_sf_hdr)))
785 return -1;
786 return xfs_attr_shortform_bytesfit(dp, bytes);
787}
788
789/*
790 * Convert a leaf attribute list to shortform attribute list
791 */
792int
793xfs_attr3_leaf_to_shortform(
794 struct xfs_buf *bp,
795 struct xfs_da_args *args,
796 int forkoff)
797{
798 struct xfs_attr_leafblock *leaf;
799 struct xfs_attr3_icleaf_hdr ichdr;
800 struct xfs_attr_leaf_entry *entry;
801 struct xfs_attr_leaf_name_local *name_loc;
802 struct xfs_da_args nargs;
803 struct xfs_inode *dp = args->dp;
804 char *tmpbuffer;
805 int error;
806 int i;
807
808 trace_xfs_attr_leaf_to_sf(args);
809
810 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
811 if (!tmpbuffer)
812 return -ENOMEM;
813
814 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
815
816 leaf = (xfs_attr_leafblock_t *)tmpbuffer;
817 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
818 entry = xfs_attr3_leaf_entryp(leaf);
819
820 /* XXX (dgc): buffer is about to be marked stale - why zero it? */
821 memset(bp->b_addr, 0, args->geo->blksize);
822
823 /*
824 * Clean out the prior contents of the attribute list.
825 */
826 error = xfs_da_shrink_inode(args, 0, bp);
827 if (error)
828 goto out;
829
830 if (forkoff == -1) {
831 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
832 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
833 xfs_attr_fork_reset(dp, args->trans);
834 goto out;
835 }
836
837 xfs_attr_shortform_create(args);
838
839 /*
840 * Copy the attributes
841 */
842 memset((char *)&nargs, 0, sizeof(nargs));
843 nargs.geo = args->geo;
844 nargs.dp = dp;
845 nargs.firstblock = args->firstblock;
846 nargs.flist = args->flist;
847 nargs.total = args->total;
848 nargs.whichfork = XFS_ATTR_FORK;
849 nargs.trans = args->trans;
850 nargs.op_flags = XFS_DA_OP_OKNOENT;
851
852 for (i = 0; i < ichdr.count; entry++, i++) {
853 if (entry->flags & XFS_ATTR_INCOMPLETE)
854 continue; /* don't copy partial entries */
855 if (!entry->nameidx)
856 continue;
857 ASSERT(entry->flags & XFS_ATTR_LOCAL);
858 name_loc = xfs_attr3_leaf_name_local(leaf, i);
859 nargs.name = name_loc->nameval;
860 nargs.namelen = name_loc->namelen;
861 nargs.value = &name_loc->nameval[nargs.namelen];
862 nargs.valuelen = be16_to_cpu(name_loc->valuelen);
863 nargs.hashval = be32_to_cpu(entry->hashval);
864 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
865 xfs_attr_shortform_add(&nargs, forkoff);
866 }
867 error = 0;
868
869out:
870 kmem_free(tmpbuffer);
871 return error;
872}
873
874/*
875 * Convert from using a single leaf to a root node and a leaf.
876 */
877int
878xfs_attr3_leaf_to_node(
879 struct xfs_da_args *args)
880{
881 struct xfs_attr_leafblock *leaf;
882 struct xfs_attr3_icleaf_hdr icleafhdr;
883 struct xfs_attr_leaf_entry *entries;
884 struct xfs_da_node_entry *btree;
885 struct xfs_da3_icnode_hdr icnodehdr;
886 struct xfs_da_intnode *node;
887 struct xfs_inode *dp = args->dp;
888 struct xfs_mount *mp = dp->i_mount;
889 struct xfs_buf *bp1 = NULL;
890 struct xfs_buf *bp2 = NULL;
891 xfs_dablk_t blkno;
892 int error;
893
894 trace_xfs_attr_leaf_to_node(args);
895
896 error = xfs_da_grow_inode(args, &blkno);
897 if (error)
898 goto out;
899 error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
900 if (error)
901 goto out;
902
903 error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
904 if (error)
905 goto out;
906
907 /* copy leaf to new buffer, update identifiers */
908 xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
909 bp2->b_ops = bp1->b_ops;
910 memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
911 if (xfs_sb_version_hascrc(&mp->m_sb)) {
912 struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
913 hdr3->blkno = cpu_to_be64(bp2->b_bn);
914 }
915 xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
916
917 /*
918 * Set up the new root node.
919 */
920 error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
921 if (error)
922 goto out;
923 node = bp1->b_addr;
924 dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
925 btree = dp->d_ops->node_tree_p(node);
926
927 leaf = bp2->b_addr;
928 xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
929 entries = xfs_attr3_leaf_entryp(leaf);
930
931 /* both on-disk, don't endian-flip twice */
932 btree[0].hashval = entries[icleafhdr.count - 1].hashval;
933 btree[0].before = cpu_to_be32(blkno);
934 icnodehdr.count = 1;
935 dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
936 xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
937 error = 0;
938out:
939 return error;
940}
941
942/*========================================================================
943 * Routines used for growing the Btree.
944 *========================================================================*/
945
946/*
947 * Create the initial contents of a leaf attribute list
948 * or a leaf in a node attribute list.
949 */
950STATIC int
951xfs_attr3_leaf_create(
952 struct xfs_da_args *args,
953 xfs_dablk_t blkno,
954 struct xfs_buf **bpp)
955{
956 struct xfs_attr_leafblock *leaf;
957 struct xfs_attr3_icleaf_hdr ichdr;
958 struct xfs_inode *dp = args->dp;
959 struct xfs_mount *mp = dp->i_mount;
960 struct xfs_buf *bp;
961 int error;
962
963 trace_xfs_attr_leaf_create(args);
964
965 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
966 XFS_ATTR_FORK);
967 if (error)
968 return error;
969 bp->b_ops = &xfs_attr3_leaf_buf_ops;
970 xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
971 leaf = bp->b_addr;
972 memset(leaf, 0, args->geo->blksize);
973
974 memset(&ichdr, 0, sizeof(ichdr));
975 ichdr.firstused = args->geo->blksize;
976
977 if (xfs_sb_version_hascrc(&mp->m_sb)) {
978 struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
979
980 ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
981
982 hdr3->blkno = cpu_to_be64(bp->b_bn);
983 hdr3->owner = cpu_to_be64(dp->i_ino);
984 uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
985
986 ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
987 } else {
988 ichdr.magic = XFS_ATTR_LEAF_MAGIC;
989 ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
990 }
991 ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
992
993 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
994 xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
995
996 *bpp = bp;
997 return 0;
998}
999
1000/*
1001 * Split the leaf node, rebalance, then add the new entry.
1002 */
1003int
1004xfs_attr3_leaf_split(
1005 struct xfs_da_state *state,
1006 struct xfs_da_state_blk *oldblk,
1007 struct xfs_da_state_blk *newblk)
1008{
1009 xfs_dablk_t blkno;
1010 int error;
1011
1012 trace_xfs_attr_leaf_split(state->args);
1013
1014 /*
1015 * Allocate space for a new leaf node.
1016 */
1017 ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
1018 error = xfs_da_grow_inode(state->args, &blkno);
1019 if (error)
1020 return error;
1021 error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
1022 if (error)
1023 return error;
1024 newblk->blkno = blkno;
1025 newblk->magic = XFS_ATTR_LEAF_MAGIC;
1026
1027 /*
1028 * Rebalance the entries across the two leaves.
1029 * NOTE: rebalance() currently depends on the 2nd block being empty.
1030 */
1031 xfs_attr3_leaf_rebalance(state, oldblk, newblk);
1032 error = xfs_da3_blk_link(state, oldblk, newblk);
1033 if (error)
1034 return error;
1035
1036 /*
1037 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
1038 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
1039 * "new" attrs info. Will need the "old" info to remove it later.
1040 *
1041 * Insert the "new" entry in the correct block.
1042 */
1043 if (state->inleaf) {
1044 trace_xfs_attr_leaf_add_old(state->args);
1045 error = xfs_attr3_leaf_add(oldblk->bp, state->args);
1046 } else {
1047 trace_xfs_attr_leaf_add_new(state->args);
1048 error = xfs_attr3_leaf_add(newblk->bp, state->args);
1049 }
1050
1051 /*
1052 * Update last hashval in each block since we added the name.
1053 */
1054 oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
1055 newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
1056 return error;
1057}
1058
1059/*
1060 * Add a name to the leaf attribute list structure.
1061 */
1062int
1063xfs_attr3_leaf_add(
1064 struct xfs_buf *bp,
1065 struct xfs_da_args *args)
1066{
1067 struct xfs_attr_leafblock *leaf;
1068 struct xfs_attr3_icleaf_hdr ichdr;
1069 int tablesize;
1070 int entsize;
1071 int sum;
1072 int tmp;
1073 int i;
1074
1075 trace_xfs_attr_leaf_add(args);
1076
1077 leaf = bp->b_addr;
1078 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
1079 ASSERT(args->index >= 0 && args->index <= ichdr.count);
1080 entsize = xfs_attr_leaf_newentsize(args, NULL);
1081
1082 /*
1083 * Search through freemap for first-fit on new name length.
1084 * (may need to figure in size of entry struct too)
1085 */
1086 tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
1087 + xfs_attr3_leaf_hdr_size(leaf);
1088 for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
1089 if (tablesize > ichdr.firstused) {
1090 sum += ichdr.freemap[i].size;
1091 continue;
1092 }
1093 if (!ichdr.freemap[i].size)
1094 continue; /* no space in this map */
1095 tmp = entsize;
1096 if (ichdr.freemap[i].base < ichdr.firstused)
1097 tmp += sizeof(xfs_attr_leaf_entry_t);
1098 if (ichdr.freemap[i].size >= tmp) {
1099 tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
1100 goto out_log_hdr;
1101 }
1102 sum += ichdr.freemap[i].size;
1103 }
1104
1105 /*
1106 * If there are no holes in the address space of the block,
1107 * and we don't have enough freespace, then compaction will do us
1108 * no good and we should just give up.
1109 */
1110 if (!ichdr.holes && sum < entsize)
1111 return -ENOSPC;
1112
1113 /*
1114 * Compact the entries to coalesce free space.
1115 * This may change the hdr->count via dropping INCOMPLETE entries.
1116 */
1117 xfs_attr3_leaf_compact(args, &ichdr, bp);
1118
1119 /*
1120 * After compaction, the block is guaranteed to have only one
1121 * free region, in freemap[0]. If it is not big enough, give up.
1122 */
1123 if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
1124 tmp = -ENOSPC;
1125 goto out_log_hdr;
1126 }
1127
1128 tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
1129
1130out_log_hdr:
1131 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
1132 xfs_trans_log_buf(args->trans, bp,
1133 XFS_DA_LOGRANGE(leaf, &leaf->hdr,
1134 xfs_attr3_leaf_hdr_size(leaf)));
1135 return tmp;
1136}
1137
1138/*
1139 * Add a name to a leaf attribute list structure.
1140 */
1141STATIC int
1142xfs_attr3_leaf_add_work(
1143 struct xfs_buf *bp,
1144 struct xfs_attr3_icleaf_hdr *ichdr,
1145 struct xfs_da_args *args,
1146 int mapindex)
1147{
1148 struct xfs_attr_leafblock *leaf;
1149 struct xfs_attr_leaf_entry *entry;
1150 struct xfs_attr_leaf_name_local *name_loc;
1151 struct xfs_attr_leaf_name_remote *name_rmt;
1152 struct xfs_mount *mp;
1153 int tmp;
1154 int i;
1155
1156 trace_xfs_attr_leaf_add_work(args);
1157
1158 leaf = bp->b_addr;
1159 ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
1160 ASSERT(args->index >= 0 && args->index <= ichdr->count);
1161
1162 /*
1163 * Force open some space in the entry array and fill it in.
1164 */
1165 entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
1166 if (args->index < ichdr->count) {
1167 tmp = ichdr->count - args->index;
1168 tmp *= sizeof(xfs_attr_leaf_entry_t);
1169 memmove(entry + 1, entry, tmp);
1170 xfs_trans_log_buf(args->trans, bp,
1171 XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
1172 }
1173 ichdr->count++;
1174
1175 /*
1176 * Allocate space for the new string (at the end of the run).
1177 */
1178 mp = args->trans->t_mountp;
1179 ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
1180 ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
1181 ASSERT(ichdr->freemap[mapindex].size >=
1182 xfs_attr_leaf_newentsize(args, NULL));
1183 ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
1184 ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
1185
1186 ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
1187
1188 entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
1189 ichdr->freemap[mapindex].size);
1190 entry->hashval = cpu_to_be32(args->hashval);
1191 entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
1192 entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
1193 if (args->op_flags & XFS_DA_OP_RENAME) {
1194 entry->flags |= XFS_ATTR_INCOMPLETE;
1195 if ((args->blkno2 == args->blkno) &&
1196 (args->index2 <= args->index)) {
1197 args->index2++;
1198 }
1199 }
1200 xfs_trans_log_buf(args->trans, bp,
1201 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
1202 ASSERT((args->index == 0) ||
1203 (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
1204 ASSERT((args->index == ichdr->count - 1) ||
1205 (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
1206
1207 /*
1208 * For "remote" attribute values, simply note that we need to
1209 * allocate space for the "remote" value. We can't actually
1210 * allocate the extents in this transaction, and we can't decide
1211 * which blocks they should be as we might allocate more blocks
1212 * as part of this transaction (a split operation for example).
1213 */
1214 if (entry->flags & XFS_ATTR_LOCAL) {
1215 name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
1216 name_loc->namelen = args->namelen;
1217 name_loc->valuelen = cpu_to_be16(args->valuelen);
1218 memcpy((char *)name_loc->nameval, args->name, args->namelen);
1219 memcpy((char *)&name_loc->nameval[args->namelen], args->value,
1220 be16_to_cpu(name_loc->valuelen));
1221 } else {
1222 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
1223 name_rmt->namelen = args->namelen;
1224 memcpy((char *)name_rmt->name, args->name, args->namelen);
1225 entry->flags |= XFS_ATTR_INCOMPLETE;
1226 /* just in case */
1227 name_rmt->valuelen = 0;
1228 name_rmt->valueblk = 0;
1229 args->rmtblkno = 1;
1230 args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
1231 args->rmtvaluelen = args->valuelen;
1232 }
1233 xfs_trans_log_buf(args->trans, bp,
1234 XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
1235 xfs_attr_leaf_entsize(leaf, args->index)));
1236
1237 /*
1238 * Update the control info for this leaf node
1239 */
1240 if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
1241 ichdr->firstused = be16_to_cpu(entry->nameidx);
1242
1243 ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
1244 + xfs_attr3_leaf_hdr_size(leaf));
1245 tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
1246 + xfs_attr3_leaf_hdr_size(leaf);
1247
1248 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
1249 if (ichdr->freemap[i].base == tmp) {
1250 ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
1251 ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
1252 }
1253 }
1254 ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
1255 return 0;
1256}
1257
1258/*
1259 * Garbage collect a leaf attribute list block by copying it to a new buffer.
1260 */
1261STATIC void
1262xfs_attr3_leaf_compact(
1263 struct xfs_da_args *args,
1264 struct xfs_attr3_icleaf_hdr *ichdr_dst,
1265 struct xfs_buf *bp)
1266{
1267 struct xfs_attr_leafblock *leaf_src;
1268 struct xfs_attr_leafblock *leaf_dst;
1269 struct xfs_attr3_icleaf_hdr ichdr_src;
1270 struct xfs_trans *trans = args->trans;
1271 char *tmpbuffer;
1272
1273 trace_xfs_attr_leaf_compact(args);
1274
1275 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
1276 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
1277 memset(bp->b_addr, 0, args->geo->blksize);
1278 leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
1279 leaf_dst = bp->b_addr;
1280
1281 /*
1282 * Copy the on-disk header back into the destination buffer to ensure
1283 * all the information in the header that is not part of the incore
1284 * header structure is preserved.
1285 */
1286 memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
1287
1288 /* Initialise the incore headers */
1289 ichdr_src = *ichdr_dst; /* struct copy */
1290 ichdr_dst->firstused = args->geo->blksize;
1291 ichdr_dst->usedbytes = 0;
1292 ichdr_dst->count = 0;
1293 ichdr_dst->holes = 0;
1294 ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
1295 ichdr_dst->freemap[0].size = ichdr_dst->firstused -
1296 ichdr_dst->freemap[0].base;
1297
1298 /* write the header back to initialise the underlying buffer */
1299 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
1300
1301 /*
1302 * Copy all entry's in the same (sorted) order,
1303 * but allocate name/value pairs packed and in sequence.
1304 */
1305 xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
1306 leaf_dst, ichdr_dst, 0, ichdr_src.count);
1307 /*
1308 * this logs the entire buffer, but the caller must write the header
1309 * back to the buffer when it is finished modifying it.
1310 */
1311 xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
1312
1313 kmem_free(tmpbuffer);
1314}
1315
1316/*
1317 * Compare two leaf blocks "order".
1318 * Return 0 unless leaf2 should go before leaf1.
1319 */
1320static int
1321xfs_attr3_leaf_order(
1322 struct xfs_buf *leaf1_bp,
1323 struct xfs_attr3_icleaf_hdr *leaf1hdr,
1324 struct xfs_buf *leaf2_bp,
1325 struct xfs_attr3_icleaf_hdr *leaf2hdr)
1326{
1327 struct xfs_attr_leaf_entry *entries1;
1328 struct xfs_attr_leaf_entry *entries2;
1329
1330 entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
1331 entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
1332 if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
1333 ((be32_to_cpu(entries2[0].hashval) <
1334 be32_to_cpu(entries1[0].hashval)) ||
1335 (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
1336 be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
1337 return 1;
1338 }
1339 return 0;
1340}
1341
1342int
1343xfs_attr_leaf_order(
1344 struct xfs_buf *leaf1_bp,
1345 struct xfs_buf *leaf2_bp)
1346{
1347 struct xfs_attr3_icleaf_hdr ichdr1;
1348 struct xfs_attr3_icleaf_hdr ichdr2;
1349
1350 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
1351 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
1352 return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
1353}
1354
1355/*
1356 * Redistribute the attribute list entries between two leaf nodes,
1357 * taking into account the size of the new entry.
1358 *
1359 * NOTE: if new block is empty, then it will get the upper half of the
1360 * old block. At present, all (one) callers pass in an empty second block.
1361 *
1362 * This code adjusts the args->index/blkno and args->index2/blkno2 fields
1363 * to match what it is doing in splitting the attribute leaf block. Those
1364 * values are used in "atomic rename" operations on attributes. Note that
1365 * the "new" and "old" values can end up in different blocks.
1366 */
1367STATIC void
1368xfs_attr3_leaf_rebalance(
1369 struct xfs_da_state *state,
1370 struct xfs_da_state_blk *blk1,
1371 struct xfs_da_state_blk *blk2)
1372{
1373 struct xfs_da_args *args;
1374 struct xfs_attr_leafblock *leaf1;
1375 struct xfs_attr_leafblock *leaf2;
1376 struct xfs_attr3_icleaf_hdr ichdr1;
1377 struct xfs_attr3_icleaf_hdr ichdr2;
1378 struct xfs_attr_leaf_entry *entries1;
1379 struct xfs_attr_leaf_entry *entries2;
1380 int count;
1381 int totallen;
1382 int max;
1383 int space;
1384 int swap;
1385
1386 /*
1387 * Set up environment.
1388 */
1389 ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
1390 ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
1391 leaf1 = blk1->bp->b_addr;
1392 leaf2 = blk2->bp->b_addr;
1393 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
1394 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
1395 ASSERT(ichdr2.count == 0);
1396 args = state->args;
1397
1398 trace_xfs_attr_leaf_rebalance(args);
1399
1400 /*
1401 * Check ordering of blocks, reverse if it makes things simpler.
1402 *
1403 * NOTE: Given that all (current) callers pass in an empty
1404 * second block, this code should never set "swap".
1405 */
1406 swap = 0;
1407 if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
1408 struct xfs_da_state_blk *tmp_blk;
1409 struct xfs_attr3_icleaf_hdr tmp_ichdr;
1410
1411 tmp_blk = blk1;
1412 blk1 = blk2;
1413 blk2 = tmp_blk;
1414
1415 /* struct copies to swap them rather than reconverting */
1416 tmp_ichdr = ichdr1;
1417 ichdr1 = ichdr2;
1418 ichdr2 = tmp_ichdr;
1419
1420 leaf1 = blk1->bp->b_addr;
1421 leaf2 = blk2->bp->b_addr;
1422 swap = 1;
1423 }
1424
1425 /*
1426 * Examine entries until we reduce the absolute difference in
1427 * byte usage between the two blocks to a minimum. Then get
1428 * the direction to copy and the number of elements to move.
1429 *
1430 * "inleaf" is true if the new entry should be inserted into blk1.
1431 * If "swap" is also true, then reverse the sense of "inleaf".
1432 */
1433 state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
1434 blk2, &ichdr2,
1435 &count, &totallen);
1436 if (swap)
1437 state->inleaf = !state->inleaf;
1438
1439 /*
1440 * Move any entries required from leaf to leaf:
1441 */
1442 if (count < ichdr1.count) {
1443 /*
1444 * Figure the total bytes to be added to the destination leaf.
1445 */
1446 /* number entries being moved */
1447 count = ichdr1.count - count;
1448 space = ichdr1.usedbytes - totallen;
1449 space += count * sizeof(xfs_attr_leaf_entry_t);
1450
1451 /*
1452 * leaf2 is the destination, compact it if it looks tight.
1453 */
1454 max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
1455 max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
1456 if (space > max)
1457 xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
1458
1459 /*
1460 * Move high entries from leaf1 to low end of leaf2.
1461 */
1462 xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
1463 ichdr1.count - count, leaf2, &ichdr2, 0, count);
1464
1465 } else if (count > ichdr1.count) {
1466 /*
1467 * I assert that since all callers pass in an empty
1468 * second buffer, this code should never execute.
1469 */
1470 ASSERT(0);
1471
1472 /*
1473 * Figure the total bytes to be added to the destination leaf.
1474 */
1475 /* number entries being moved */
1476 count -= ichdr1.count;
1477 space = totallen - ichdr1.usedbytes;
1478 space += count * sizeof(xfs_attr_leaf_entry_t);
1479
1480 /*
1481 * leaf1 is the destination, compact it if it looks tight.
1482 */
1483 max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
1484 max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
1485 if (space > max)
1486 xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
1487
1488 /*
1489 * Move low entries from leaf2 to high end of leaf1.
1490 */
1491 xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
1492 ichdr1.count, count);
1493 }
1494
1495 xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
1496 xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
1497 xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
1498 xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
1499
1500 /*
1501 * Copy out last hashval in each block for B-tree code.
1502 */
1503 entries1 = xfs_attr3_leaf_entryp(leaf1);
1504 entries2 = xfs_attr3_leaf_entryp(leaf2);
1505 blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
1506 blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
1507
1508 /*
1509 * Adjust the expected index for insertion.
1510 * NOTE: this code depends on the (current) situation that the
1511 * second block was originally empty.
1512 *
1513 * If the insertion point moved to the 2nd block, we must adjust
1514 * the index. We must also track the entry just following the
1515 * new entry for use in an "atomic rename" operation, that entry
1516 * is always the "old" entry and the "new" entry is what we are
1517 * inserting. The index/blkno fields refer to the "old" entry,
1518 * while the index2/blkno2 fields refer to the "new" entry.
1519 */
1520 if (blk1->index > ichdr1.count) {
1521 ASSERT(state->inleaf == 0);
1522 blk2->index = blk1->index - ichdr1.count;
1523 args->index = args->index2 = blk2->index;
1524 args->blkno = args->blkno2 = blk2->blkno;
1525 } else if (blk1->index == ichdr1.count) {
1526 if (state->inleaf) {
1527 args->index = blk1->index;
1528 args->blkno = blk1->blkno;
1529 args->index2 = 0;
1530 args->blkno2 = blk2->blkno;
1531 } else {
1532 /*
1533 * On a double leaf split, the original attr location
1534 * is already stored in blkno2/index2, so don't
1535 * overwrite it overwise we corrupt the tree.
1536 */
1537 blk2->index = blk1->index - ichdr1.count;
1538 args->index = blk2->index;
1539 args->blkno = blk2->blkno;
1540 if (!state->extravalid) {
1541 /*
1542 * set the new attr location to match the old
1543 * one and let the higher level split code
1544 * decide where in the leaf to place it.
1545 */
1546 args->index2 = blk2->index;
1547 args->blkno2 = blk2->blkno;
1548 }
1549 }
1550 } else {
1551 ASSERT(state->inleaf == 1);
1552 args->index = args->index2 = blk1->index;
1553 args->blkno = args->blkno2 = blk1->blkno;
1554 }
1555}
1556
1557/*
1558 * Examine entries until we reduce the absolute difference in
1559 * byte usage between the two blocks to a minimum.
1560 * GROT: Is this really necessary? With other than a 512 byte blocksize,
1561 * GROT: there will always be enough room in either block for a new entry.
1562 * GROT: Do a double-split for this case?
1563 */
1564STATIC int
1565xfs_attr3_leaf_figure_balance(
1566 struct xfs_da_state *state,
1567 struct xfs_da_state_blk *blk1,
1568 struct xfs_attr3_icleaf_hdr *ichdr1,
1569 struct xfs_da_state_blk *blk2,
1570 struct xfs_attr3_icleaf_hdr *ichdr2,
1571 int *countarg,
1572 int *usedbytesarg)
1573{
1574 struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr;
1575 struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr;
1576 struct xfs_attr_leaf_entry *entry;
1577 int count;
1578 int max;
1579 int index;
1580 int totallen = 0;
1581 int half;
1582 int lastdelta;
1583 int foundit = 0;
1584 int tmp;
1585
1586 /*
1587 * Examine entries until we reduce the absolute difference in
1588 * byte usage between the two blocks to a minimum.
1589 */
1590 max = ichdr1->count + ichdr2->count;
1591 half = (max + 1) * sizeof(*entry);
1592 half += ichdr1->usedbytes + ichdr2->usedbytes +
1593 xfs_attr_leaf_newentsize(state->args, NULL);
1594 half /= 2;
1595 lastdelta = state->args->geo->blksize;
1596 entry = xfs_attr3_leaf_entryp(leaf1);
1597 for (count = index = 0; count < max; entry++, index++, count++) {
1598
1599#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
1600 /*
1601 * The new entry is in the first block, account for it.
1602 */
1603 if (count == blk1->index) {
1604 tmp = totallen + sizeof(*entry) +
1605 xfs_attr_leaf_newentsize(state->args, NULL);
1606 if (XFS_ATTR_ABS(half - tmp) > lastdelta)
1607 break;
1608 lastdelta = XFS_ATTR_ABS(half - tmp);
1609 totallen = tmp;
1610 foundit = 1;
1611 }
1612
1613 /*
1614 * Wrap around into the second block if necessary.
1615 */
1616 if (count == ichdr1->count) {
1617 leaf1 = leaf2;
1618 entry = xfs_attr3_leaf_entryp(leaf1);
1619 index = 0;
1620 }
1621
1622 /*
1623 * Figure out if next leaf entry would be too much.
1624 */
1625 tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
1626 index);
1627 if (XFS_ATTR_ABS(half - tmp) > lastdelta)
1628 break;
1629 lastdelta = XFS_ATTR_ABS(half - tmp);
1630 totallen = tmp;
1631#undef XFS_ATTR_ABS
1632 }
1633
1634 /*
1635 * Calculate the number of usedbytes that will end up in lower block.
1636 * If new entry not in lower block, fix up the count.
1637 */
1638 totallen -= count * sizeof(*entry);
1639 if (foundit) {
1640 totallen -= sizeof(*entry) +
1641 xfs_attr_leaf_newentsize(state->args, NULL);
1642 }
1643
1644 *countarg = count;
1645 *usedbytesarg = totallen;
1646 return foundit;
1647}
1648
1649/*========================================================================
1650 * Routines used for shrinking the Btree.
1651 *========================================================================*/
1652
1653/*
1654 * Check a leaf block and its neighbors to see if the block should be
1655 * collapsed into one or the other neighbor. Always keep the block
1656 * with the smaller block number.
1657 * If the current block is over 50% full, don't try to join it, return 0.
1658 * If the block is empty, fill in the state structure and return 2.
1659 * If it can be collapsed, fill in the state structure and return 1.
1660 * If nothing can be done, return 0.
1661 *
1662 * GROT: allow for INCOMPLETE entries in calculation.
1663 */
1664int
1665xfs_attr3_leaf_toosmall(
1666 struct xfs_da_state *state,
1667 int *action)
1668{
1669 struct xfs_attr_leafblock *leaf;
1670 struct xfs_da_state_blk *blk;
1671 struct xfs_attr3_icleaf_hdr ichdr;
1672 struct xfs_buf *bp;
1673 xfs_dablk_t blkno;
1674 int bytes;
1675 int forward;
1676 int error;
1677 int retval;
1678 int i;
1679
1680 trace_xfs_attr_leaf_toosmall(state->args);
1681
1682 /*
1683 * Check for the degenerate case of the block being over 50% full.
1684 * If so, it's not worth even looking to see if we might be able
1685 * to coalesce with a sibling.
1686 */
1687 blk = &state->path.blk[ state->path.active-1 ];
1688 leaf = blk->bp->b_addr;
1689 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
1690 bytes = xfs_attr3_leaf_hdr_size(leaf) +
1691 ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
1692 ichdr.usedbytes;
1693 if (bytes > (state->args->geo->blksize >> 1)) {
1694 *action = 0; /* blk over 50%, don't try to join */
1695 return 0;
1696 }
1697
1698 /*
1699 * Check for the degenerate case of the block being empty.
1700 * If the block is empty, we'll simply delete it, no need to
1701 * coalesce it with a sibling block. We choose (arbitrarily)
1702 * to merge with the forward block unless it is NULL.
1703 */
1704 if (ichdr.count == 0) {
1705 /*
1706 * Make altpath point to the block we want to keep and
1707 * path point to the block we want to drop (this one).
1708 */
1709 forward = (ichdr.forw != 0);
1710 memcpy(&state->altpath, &state->path, sizeof(state->path));
1711 error = xfs_da3_path_shift(state, &state->altpath, forward,
1712 0, &retval);
1713 if (error)
1714 return error;
1715 if (retval) {
1716 *action = 0;
1717 } else {
1718 *action = 2;
1719 }
1720 return 0;
1721 }
1722
1723 /*
1724 * Examine each sibling block to see if we can coalesce with
1725 * at least 25% free space to spare. We need to figure out
1726 * whether to merge with the forward or the backward block.
1727 * We prefer coalescing with the lower numbered sibling so as
1728 * to shrink an attribute list over time.
1729 */
1730 /* start with smaller blk num */
1731 forward = ichdr.forw < ichdr.back;
1732 for (i = 0; i < 2; forward = !forward, i++) {
1733 struct xfs_attr3_icleaf_hdr ichdr2;
1734 if (forward)
1735 blkno = ichdr.forw;
1736 else
1737 blkno = ichdr.back;
1738 if (blkno == 0)
1739 continue;
1740 error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
1741 blkno, -1, &bp);
1742 if (error)
1743 return error;
1744
1745 xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
1746
1747 bytes = state->args->geo->blksize -
1748 (state->args->geo->blksize >> 2) -
1749 ichdr.usedbytes - ichdr2.usedbytes -
1750 ((ichdr.count + ichdr2.count) *
1751 sizeof(xfs_attr_leaf_entry_t)) -
1752 xfs_attr3_leaf_hdr_size(leaf);
1753
1754 xfs_trans_brelse(state->args->trans, bp);
1755 if (bytes >= 0)
1756 break; /* fits with at least 25% to spare */
1757 }
1758 if (i >= 2) {
1759 *action = 0;
1760 return 0;
1761 }
1762
1763 /*
1764 * Make altpath point to the block we want to keep (the lower
1765 * numbered block) and path point to the block we want to drop.
1766 */
1767 memcpy(&state->altpath, &state->path, sizeof(state->path));
1768 if (blkno < blk->blkno) {
1769 error = xfs_da3_path_shift(state, &state->altpath, forward,
1770 0, &retval);
1771 } else {
1772 error = xfs_da3_path_shift(state, &state->path, forward,
1773 0, &retval);
1774 }
1775 if (error)
1776 return error;
1777 if (retval) {
1778 *action = 0;
1779 } else {
1780 *action = 1;
1781 }
1782 return 0;
1783}
1784
1785/*
1786 * Remove a name from the leaf attribute list structure.
1787 *
1788 * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
1789 * If two leaves are 37% full, when combined they will leave 25% free.
1790 */
1791int
1792xfs_attr3_leaf_remove(
1793 struct xfs_buf *bp,
1794 struct xfs_da_args *args)
1795{
1796 struct xfs_attr_leafblock *leaf;
1797 struct xfs_attr3_icleaf_hdr ichdr;
1798 struct xfs_attr_leaf_entry *entry;
1799 int before;
1800 int after;
1801 int smallest;
1802 int entsize;
1803 int tablesize;
1804 int tmp;
1805 int i;
1806
1807 trace_xfs_attr_leaf_remove(args);
1808
1809 leaf = bp->b_addr;
1810 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
1811
1812 ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
1813 ASSERT(args->index >= 0 && args->index < ichdr.count);
1814 ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
1815 xfs_attr3_leaf_hdr_size(leaf));
1816
1817 entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
1818
1819 ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
1820 ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
1821
1822 /*
1823 * Scan through free region table:
1824 * check for adjacency of free'd entry with an existing one,
1825 * find smallest free region in case we need to replace it,
1826 * adjust any map that borders the entry table,
1827 */
1828 tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
1829 + xfs_attr3_leaf_hdr_size(leaf);
1830 tmp = ichdr.freemap[0].size;
1831 before = after = -1;
1832 smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
1833 entsize = xfs_attr_leaf_entsize(leaf, args->index);
1834 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
1835 ASSERT(ichdr.freemap[i].base < args->geo->blksize);
1836 ASSERT(ichdr.freemap[i].size < args->geo->blksize);
1837 if (ichdr.freemap[i].base == tablesize) {
1838 ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
1839 ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
1840 }
1841
1842 if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
1843 be16_to_cpu(entry->nameidx)) {
1844 before = i;
1845 } else if (ichdr.freemap[i].base ==
1846 (be16_to_cpu(entry->nameidx) + entsize)) {
1847 after = i;
1848 } else if (ichdr.freemap[i].size < tmp) {
1849 tmp = ichdr.freemap[i].size;
1850 smallest = i;
1851 }
1852 }
1853
1854 /*
1855 * Coalesce adjacent freemap regions,
1856 * or replace the smallest region.
1857 */
1858 if ((before >= 0) || (after >= 0)) {
1859 if ((before >= 0) && (after >= 0)) {
1860 ichdr.freemap[before].size += entsize;
1861 ichdr.freemap[before].size += ichdr.freemap[after].size;
1862 ichdr.freemap[after].base = 0;
1863 ichdr.freemap[after].size = 0;
1864 } else if (before >= 0) {
1865 ichdr.freemap[before].size += entsize;
1866 } else {
1867 ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
1868 ichdr.freemap[after].size += entsize;
1869 }
1870 } else {
1871 /*
1872 * Replace smallest region (if it is smaller than free'd entry)
1873 */
1874 if (ichdr.freemap[smallest].size < entsize) {
1875 ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
1876 ichdr.freemap[smallest].size = entsize;
1877 }
1878 }
1879
1880 /*
1881 * Did we remove the first entry?
1882 */
1883 if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
1884 smallest = 1;
1885 else
1886 smallest = 0;
1887
1888 /*
1889 * Compress the remaining entries and zero out the removed stuff.
1890 */
1891 memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
1892 ichdr.usedbytes -= entsize;
1893 xfs_trans_log_buf(args->trans, bp,
1894 XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
1895 entsize));
1896
1897 tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
1898 memmove(entry, entry + 1, tmp);
1899 ichdr.count--;
1900 xfs_trans_log_buf(args->trans, bp,
1901 XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
1902
1903 entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
1904 memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
1905
1906 /*
1907 * If we removed the first entry, re-find the first used byte
1908 * in the name area. Note that if the entry was the "firstused",
1909 * then we don't have a "hole" in our block resulting from
1910 * removing the name.
1911 */
1912 if (smallest) {
1913 tmp = args->geo->blksize;
1914 entry = xfs_attr3_leaf_entryp(leaf);
1915 for (i = ichdr.count - 1; i >= 0; entry++, i--) {
1916 ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
1917 ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
1918
1919 if (be16_to_cpu(entry->nameidx) < tmp)
1920 tmp = be16_to_cpu(entry->nameidx);
1921 }
1922 ichdr.firstused = tmp;
1923 if (!ichdr.firstused)
1924 ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
1925 } else {
1926 ichdr.holes = 1; /* mark as needing compaction */
1927 }
1928 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
1929 xfs_trans_log_buf(args->trans, bp,
1930 XFS_DA_LOGRANGE(leaf, &leaf->hdr,
1931 xfs_attr3_leaf_hdr_size(leaf)));
1932
1933 /*
1934 * Check if leaf is less than 50% full, caller may want to
1935 * "join" the leaf with a sibling if so.
1936 */
1937 tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
1938 ichdr.count * sizeof(xfs_attr_leaf_entry_t);
1939
1940 return tmp < args->geo->magicpct; /* leaf is < 37% full */
1941}
1942
1943/*
1944 * Move all the attribute list entries from drop_leaf into save_leaf.
1945 */
1946void
1947xfs_attr3_leaf_unbalance(
1948 struct xfs_da_state *state,
1949 struct xfs_da_state_blk *drop_blk,
1950 struct xfs_da_state_blk *save_blk)
1951{
1952 struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
1953 struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
1954 struct xfs_attr3_icleaf_hdr drophdr;
1955 struct xfs_attr3_icleaf_hdr savehdr;
1956 struct xfs_attr_leaf_entry *entry;
1957
1958 trace_xfs_attr_leaf_unbalance(state->args);
1959
1960 drop_leaf = drop_blk->bp->b_addr;
1961 save_leaf = save_blk->bp->b_addr;
1962 xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
1963 xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
1964 entry = xfs_attr3_leaf_entryp(drop_leaf);
1965
1966 /*
1967 * Save last hashval from dying block for later Btree fixup.
1968 */
1969 drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
1970
1971 /*
1972 * Check if we need a temp buffer, or can we do it in place.
1973 * Note that we don't check "leaf" for holes because we will
1974 * always be dropping it, toosmall() decided that for us already.
1975 */
1976 if (savehdr.holes == 0) {
1977 /*
1978 * dest leaf has no holes, so we add there. May need
1979 * to make some room in the entry array.
1980 */
1981 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
1982 drop_blk->bp, &drophdr)) {
1983 xfs_attr3_leaf_moveents(state->args,
1984 drop_leaf, &drophdr, 0,
1985 save_leaf, &savehdr, 0,
1986 drophdr.count);
1987 } else {
1988 xfs_attr3_leaf_moveents(state->args,
1989 drop_leaf, &drophdr, 0,
1990 save_leaf, &savehdr,
1991 savehdr.count, drophdr.count);
1992 }
1993 } else {
1994 /*
1995 * Destination has holes, so we make a temporary copy
1996 * of the leaf and add them both to that.
1997 */
1998 struct xfs_attr_leafblock *tmp_leaf;
1999 struct xfs_attr3_icleaf_hdr tmphdr;
2000
2001 tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
2002
2003 /*
2004 * Copy the header into the temp leaf so that all the stuff
2005 * not in the incore header is present and gets copied back in
2006 * once we've moved all the entries.
2007 */
2008 memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
2009
2010 memset(&tmphdr, 0, sizeof(tmphdr));
2011 tmphdr.magic = savehdr.magic;
2012 tmphdr.forw = savehdr.forw;
2013 tmphdr.back = savehdr.back;
2014 tmphdr.firstused = state->args->geo->blksize;
2015
2016 /* write the header to the temp buffer to initialise it */
2017 xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
2018
2019 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
2020 drop_blk->bp, &drophdr)) {
2021 xfs_attr3_leaf_moveents(state->args,
2022 drop_leaf, &drophdr, 0,
2023 tmp_leaf, &tmphdr, 0,
2024 drophdr.count);
2025 xfs_attr3_leaf_moveents(state->args,
2026 save_leaf, &savehdr, 0,
2027 tmp_leaf, &tmphdr, tmphdr.count,
2028 savehdr.count);
2029 } else {
2030 xfs_attr3_leaf_moveents(state->args,
2031 save_leaf, &savehdr, 0,
2032 tmp_leaf, &tmphdr, 0,
2033 savehdr.count);
2034 xfs_attr3_leaf_moveents(state->args,
2035 drop_leaf, &drophdr, 0,
2036 tmp_leaf, &tmphdr, tmphdr.count,
2037 drophdr.count);
2038 }
2039 memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
2040 savehdr = tmphdr; /* struct copy */
2041 kmem_free(tmp_leaf);
2042 }
2043
2044 xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
2045 xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
2046 state->args->geo->blksize - 1);
2047
2048 /*
2049 * Copy out last hashval in each block for B-tree code.
2050 */
2051 entry = xfs_attr3_leaf_entryp(save_leaf);
2052 save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
2053}
2054
2055/*========================================================================
2056 * Routines used for finding things in the Btree.
2057 *========================================================================*/
2058
2059/*
2060 * Look up a name in a leaf attribute list structure.
2061 * This is the internal routine, it uses the caller's buffer.
2062 *
2063 * Note that duplicate keys are allowed, but only check within the
2064 * current leaf node. The Btree code must check in adjacent leaf nodes.
2065 *
2066 * Return in args->index the index into the entry[] array of either
2067 * the found entry, or where the entry should have been (insert before
2068 * that entry).
2069 *
2070 * Don't change the args->value unless we find the attribute.
2071 */
2072int
2073xfs_attr3_leaf_lookup_int(
2074 struct xfs_buf *bp,
2075 struct xfs_da_args *args)
2076{
2077 struct xfs_attr_leafblock *leaf;
2078 struct xfs_attr3_icleaf_hdr ichdr;
2079 struct xfs_attr_leaf_entry *entry;
2080 struct xfs_attr_leaf_entry *entries;
2081 struct xfs_attr_leaf_name_local *name_loc;
2082 struct xfs_attr_leaf_name_remote *name_rmt;
2083 xfs_dahash_t hashval;
2084 int probe;
2085 int span;
2086
2087 trace_xfs_attr_leaf_lookup(args);
2088
2089 leaf = bp->b_addr;
2090 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
2091 entries = xfs_attr3_leaf_entryp(leaf);
2092 ASSERT(ichdr.count < args->geo->blksize / 8);
2093
2094 /*
2095 * Binary search. (note: small blocks will skip this loop)
2096 */
2097 hashval = args->hashval;
2098 probe = span = ichdr.count / 2;
2099 for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
2100 span /= 2;
2101 if (be32_to_cpu(entry->hashval) < hashval)
2102 probe += span;
2103 else if (be32_to_cpu(entry->hashval) > hashval)
2104 probe -= span;
2105 else
2106 break;
2107 }
2108 ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
2109 ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
2110
2111 /*
2112 * Since we may have duplicate hashval's, find the first matching
2113 * hashval in the leaf.
2114 */
2115 while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
2116 entry--;
2117 probe--;
2118 }
2119 while (probe < ichdr.count &&
2120 be32_to_cpu(entry->hashval) < hashval) {
2121 entry++;
2122 probe++;
2123 }
2124 if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
2125 args->index = probe;
2126 return -ENOATTR;
2127 }
2128
2129 /*
2130 * Duplicate keys may be present, so search all of them for a match.
2131 */
2132 for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
2133 entry++, probe++) {
2134/*
2135 * GROT: Add code to remove incomplete entries.
2136 */
2137 /*
2138 * If we are looking for INCOMPLETE entries, show only those.
2139 * If we are looking for complete entries, show only those.
2140 */
2141 if ((args->flags & XFS_ATTR_INCOMPLETE) !=
2142 (entry->flags & XFS_ATTR_INCOMPLETE)) {
2143 continue;
2144 }
2145 if (entry->flags & XFS_ATTR_LOCAL) {
2146 name_loc = xfs_attr3_leaf_name_local(leaf, probe);
2147 if (name_loc->namelen != args->namelen)
2148 continue;
2149 if (memcmp(args->name, name_loc->nameval,
2150 args->namelen) != 0)
2151 continue;
2152 if (!xfs_attr_namesp_match(args->flags, entry->flags))
2153 continue;
2154 args->index = probe;
2155 return -EEXIST;
2156 } else {
2157 name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
2158 if (name_rmt->namelen != args->namelen)
2159 continue;
2160 if (memcmp(args->name, name_rmt->name,
2161 args->namelen) != 0)
2162 continue;
2163 if (!xfs_attr_namesp_match(args->flags, entry->flags))
2164 continue;
2165 args->index = probe;
2166 args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
2167 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2168 args->rmtblkcnt = xfs_attr3_rmt_blocks(
2169 args->dp->i_mount,
2170 args->rmtvaluelen);
2171 return -EEXIST;
2172 }
2173 }
2174 args->index = probe;
2175 return -ENOATTR;
2176}
2177
2178/*
2179 * Get the value associated with an attribute name from a leaf attribute
2180 * list structure.
2181 */
2182int
2183xfs_attr3_leaf_getvalue(
2184 struct xfs_buf *bp,
2185 struct xfs_da_args *args)
2186{
2187 struct xfs_attr_leafblock *leaf;
2188 struct xfs_attr3_icleaf_hdr ichdr;
2189 struct xfs_attr_leaf_entry *entry;
2190 struct xfs_attr_leaf_name_local *name_loc;
2191 struct xfs_attr_leaf_name_remote *name_rmt;
2192 int valuelen;
2193
2194 leaf = bp->b_addr;
2195 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
2196 ASSERT(ichdr.count < args->geo->blksize / 8);
2197 ASSERT(args->index < ichdr.count);
2198
2199 entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
2200 if (entry->flags & XFS_ATTR_LOCAL) {
2201 name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
2202 ASSERT(name_loc->namelen == args->namelen);
2203 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
2204 valuelen = be16_to_cpu(name_loc->valuelen);
2205 if (args->flags & ATTR_KERNOVAL) {
2206 args->valuelen = valuelen;
2207 return 0;
2208 }
2209 if (args->valuelen < valuelen) {
2210 args->valuelen = valuelen;
2211 return -ERANGE;
2212 }
2213 args->valuelen = valuelen;
2214 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
2215 } else {
2216 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
2217 ASSERT(name_rmt->namelen == args->namelen);
2218 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
2219 args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
2220 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2221 args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
2222 args->rmtvaluelen);
2223 if (args->flags & ATTR_KERNOVAL) {
2224 args->valuelen = args->rmtvaluelen;
2225 return 0;
2226 }
2227 if (args->valuelen < args->rmtvaluelen) {
2228 args->valuelen = args->rmtvaluelen;
2229 return -ERANGE;
2230 }
2231 args->valuelen = args->rmtvaluelen;
2232 }
2233 return 0;
2234}
2235
2236/*========================================================================
2237 * Utility routines.
2238 *========================================================================*/
2239
2240/*
2241 * Move the indicated entries from one leaf to another.
2242 * NOTE: this routine modifies both source and destination leaves.
2243 */
2244/*ARGSUSED*/
2245STATIC void
2246xfs_attr3_leaf_moveents(
2247 struct xfs_da_args *args,
2248 struct xfs_attr_leafblock *leaf_s,
2249 struct xfs_attr3_icleaf_hdr *ichdr_s,
2250 int start_s,
2251 struct xfs_attr_leafblock *leaf_d,
2252 struct xfs_attr3_icleaf_hdr *ichdr_d,
2253 int start_d,
2254 int count)
2255{
2256 struct xfs_attr_leaf_entry *entry_s;
2257 struct xfs_attr_leaf_entry *entry_d;
2258 int desti;
2259 int tmp;
2260 int i;
2261
2262 /*
2263 * Check for nothing to do.
2264 */
2265 if (count == 0)
2266 return;
2267
2268 /*
2269 * Set up environment.
2270 */
2271 ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
2272 ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
2273 ASSERT(ichdr_s->magic == ichdr_d->magic);
2274 ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
2275 ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
2276 + xfs_attr3_leaf_hdr_size(leaf_s));
2277 ASSERT(ichdr_d->count < args->geo->blksize / 8);
2278 ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
2279 + xfs_attr3_leaf_hdr_size(leaf_d));
2280
2281 ASSERT(start_s < ichdr_s->count);
2282 ASSERT(start_d <= ichdr_d->count);
2283 ASSERT(count <= ichdr_s->count);
2284
2285
2286 /*
2287 * Move the entries in the destination leaf up to make a hole?
2288 */
2289 if (start_d < ichdr_d->count) {
2290 tmp = ichdr_d->count - start_d;
2291 tmp *= sizeof(xfs_attr_leaf_entry_t);
2292 entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
2293 entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
2294 memmove(entry_d, entry_s, tmp);
2295 }
2296
2297 /*
2298 * Copy all entry's in the same (sorted) order,
2299 * but allocate attribute info packed and in sequence.
2300 */
2301 entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
2302 entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
2303 desti = start_d;
2304 for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
2305 ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
2306 tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
2307#ifdef GROT
2308 /*
2309 * Code to drop INCOMPLETE entries. Difficult to use as we
2310 * may also need to change the insertion index. Code turned
2311 * off for 6.2, should be revisited later.
2312 */
2313 if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
2314 memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
2315 ichdr_s->usedbytes -= tmp;
2316 ichdr_s->count -= 1;
2317 entry_d--; /* to compensate for ++ in loop hdr */
2318 desti--;
2319 if ((start_s + i) < offset)
2320 result++; /* insertion index adjustment */
2321 } else {
2322#endif /* GROT */
2323 ichdr_d->firstused -= tmp;
2324 /* both on-disk, don't endian flip twice */
2325 entry_d->hashval = entry_s->hashval;
2326 entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
2327 entry_d->flags = entry_s->flags;
2328 ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
2329 <= args->geo->blksize);
2330 memmove(xfs_attr3_leaf_name(leaf_d, desti),
2331 xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
2332 ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
2333 <= args->geo->blksize);
2334 memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
2335 ichdr_s->usedbytes -= tmp;
2336 ichdr_d->usedbytes += tmp;
2337 ichdr_s->count -= 1;
2338 ichdr_d->count += 1;
2339 tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
2340 + xfs_attr3_leaf_hdr_size(leaf_d);
2341 ASSERT(ichdr_d->firstused >= tmp);
2342#ifdef GROT
2343 }
2344#endif /* GROT */
2345 }
2346
2347 /*
2348 * Zero out the entries we just copied.
2349 */
2350 if (start_s == ichdr_s->count) {
2351 tmp = count * sizeof(xfs_attr_leaf_entry_t);
2352 entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
2353 ASSERT(((char *)entry_s + tmp) <=
2354 ((char *)leaf_s + args->geo->blksize));
2355 memset(entry_s, 0, tmp);
2356 } else {
2357 /*
2358 * Move the remaining entries down to fill the hole,
2359 * then zero the entries at the top.
2360 */
2361 tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
2362 entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
2363 entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
2364 memmove(entry_d, entry_s, tmp);
2365
2366 tmp = count * sizeof(xfs_attr_leaf_entry_t);
2367 entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
2368 ASSERT(((char *)entry_s + tmp) <=
2369 ((char *)leaf_s + args->geo->blksize));
2370 memset(entry_s, 0, tmp);
2371 }
2372
2373 /*
2374 * Fill in the freemap information
2375 */
2376 ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
2377 ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
2378 ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
2379 ichdr_d->freemap[1].base = 0;
2380 ichdr_d->freemap[2].base = 0;
2381 ichdr_d->freemap[1].size = 0;
2382 ichdr_d->freemap[2].size = 0;
2383 ichdr_s->holes = 1; /* leaf may not be compact */
2384}
2385
2386/*
2387 * Pick up the last hashvalue from a leaf block.
2388 */
2389xfs_dahash_t
2390xfs_attr_leaf_lasthash(
2391 struct xfs_buf *bp,
2392 int *count)
2393{
2394 struct xfs_attr3_icleaf_hdr ichdr;
2395 struct xfs_attr_leaf_entry *entries;
2396
2397 xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
2398 entries = xfs_attr3_leaf_entryp(bp->b_addr);
2399 if (count)
2400 *count = ichdr.count;
2401 if (!ichdr.count)
2402 return 0;
2403 return be32_to_cpu(entries[ichdr.count - 1].hashval);
2404}
2405
2406/*
2407 * Calculate the number of bytes used to store the indicated attribute
2408 * (whether local or remote only calculate bytes in this block).
2409 */
2410STATIC int
2411xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
2412{
2413 struct xfs_attr_leaf_entry *entries;
2414 xfs_attr_leaf_name_local_t *name_loc;
2415 xfs_attr_leaf_name_remote_t *name_rmt;
2416 int size;
2417
2418 entries = xfs_attr3_leaf_entryp(leaf);
2419 if (entries[index].flags & XFS_ATTR_LOCAL) {
2420 name_loc = xfs_attr3_leaf_name_local(leaf, index);
2421 size = xfs_attr_leaf_entsize_local(name_loc->namelen,
2422 be16_to_cpu(name_loc->valuelen));
2423 } else {
2424 name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
2425 size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
2426 }
2427 return size;
2428}
2429
2430/*
2431 * Calculate the number of bytes that would be required to store the new
2432 * attribute (whether local or remote only calculate bytes in this block).
2433 * This routine decides as a side effect whether the attribute will be
2434 * a "local" or a "remote" attribute.
2435 */
2436int
2437xfs_attr_leaf_newentsize(
2438 struct xfs_da_args *args,
2439 int *local)
2440{
2441 int size;
2442
2443 size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
2444 if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
2445 if (local)
2446 *local = 1;
2447 return size;
2448 }
2449 if (local)
2450 *local = 0;
2451 return xfs_attr_leaf_entsize_remote(args->namelen);
2452}
2453
2454
2455/*========================================================================
2456 * Manage the INCOMPLETE flag in a leaf entry
2457 *========================================================================*/
2458
2459/*
2460 * Clear the INCOMPLETE flag on an entry in a leaf block.
2461 */
2462int
2463xfs_attr3_leaf_clearflag(
2464 struct xfs_da_args *args)
2465{
2466 struct xfs_attr_leafblock *leaf;
2467 struct xfs_attr_leaf_entry *entry;
2468 struct xfs_attr_leaf_name_remote *name_rmt;
2469 struct xfs_buf *bp;
2470 int error;
2471#ifdef DEBUG
2472 struct xfs_attr3_icleaf_hdr ichdr;
2473 xfs_attr_leaf_name_local_t *name_loc;
2474 int namelen;
2475 char *name;
2476#endif /* DEBUG */
2477
2478 trace_xfs_attr_leaf_clearflag(args);
2479 /*
2480 * Set up the operation.
2481 */
2482 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2483 if (error)
2484 return error;
2485
2486 leaf = bp->b_addr;
2487 entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
2488 ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
2489
2490#ifdef DEBUG
2491 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
2492 ASSERT(args->index < ichdr.count);
2493 ASSERT(args->index >= 0);
2494
2495 if (entry->flags & XFS_ATTR_LOCAL) {
2496 name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
2497 namelen = name_loc->namelen;
2498 name = (char *)name_loc->nameval;
2499 } else {
2500 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
2501 namelen = name_rmt->namelen;
2502 name = (char *)name_rmt->name;
2503 }
2504 ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
2505 ASSERT(namelen == args->namelen);
2506 ASSERT(memcmp(name, args->name, namelen) == 0);
2507#endif /* DEBUG */
2508
2509 entry->flags &= ~XFS_ATTR_INCOMPLETE;
2510 xfs_trans_log_buf(args->trans, bp,
2511 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
2512
2513 if (args->rmtblkno) {
2514 ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
2515 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
2516 name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
2517 name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
2518 xfs_trans_log_buf(args->trans, bp,
2519 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
2520 }
2521
2522 /*
2523 * Commit the flag value change and start the next trans in series.
2524 */
2525 return xfs_trans_roll(&args->trans, args->dp);
2526}
2527
2528/*
2529 * Set the INCOMPLETE flag on an entry in a leaf block.
2530 */
2531int
2532xfs_attr3_leaf_setflag(
2533 struct xfs_da_args *args)
2534{
2535 struct xfs_attr_leafblock *leaf;
2536 struct xfs_attr_leaf_entry *entry;
2537 struct xfs_attr_leaf_name_remote *name_rmt;
2538 struct xfs_buf *bp;
2539 int error;
2540#ifdef DEBUG
2541 struct xfs_attr3_icleaf_hdr ichdr;
2542#endif
2543
2544 trace_xfs_attr_leaf_setflag(args);
2545
2546 /*
2547 * Set up the operation.
2548 */
2549 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2550 if (error)
2551 return error;
2552
2553 leaf = bp->b_addr;
2554#ifdef DEBUG
2555 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
2556 ASSERT(args->index < ichdr.count);
2557 ASSERT(args->index >= 0);
2558#endif
2559 entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
2560
2561 ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
2562 entry->flags |= XFS_ATTR_INCOMPLETE;
2563 xfs_trans_log_buf(args->trans, bp,
2564 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
2565 if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
2566 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
2567 name_rmt->valueblk = 0;
2568 name_rmt->valuelen = 0;
2569 xfs_trans_log_buf(args->trans, bp,
2570 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
2571 }
2572
2573 /*
2574 * Commit the flag value change and start the next trans in series.
2575 */
2576 return xfs_trans_roll(&args->trans, args->dp);
2577}
2578
2579/*
2580 * In a single transaction, clear the INCOMPLETE flag on the leaf entry
2581 * given by args->blkno/index and set the INCOMPLETE flag on the leaf
2582 * entry given by args->blkno2/index2.
2583 *
2584 * Note that they could be in different blocks, or in the same block.
2585 */
2586int
2587xfs_attr3_leaf_flipflags(
2588 struct xfs_da_args *args)
2589{
2590 struct xfs_attr_leafblock *leaf1;
2591 struct xfs_attr_leafblock *leaf2;
2592 struct xfs_attr_leaf_entry *entry1;
2593 struct xfs_attr_leaf_entry *entry2;
2594 struct xfs_attr_leaf_name_remote *name_rmt;
2595 struct xfs_buf *bp1;
2596 struct xfs_buf *bp2;
2597 int error;
2598#ifdef DEBUG
2599 struct xfs_attr3_icleaf_hdr ichdr1;
2600 struct xfs_attr3_icleaf_hdr ichdr2;
2601 xfs_attr_leaf_name_local_t *name_loc;
2602 int namelen1, namelen2;
2603 char *name1, *name2;
2604#endif /* DEBUG */
2605
2606 trace_xfs_attr_leaf_flipflags(args);
2607
2608 /*
2609 * Read the block containing the "old" attr
2610 */
2611 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
2612 if (error)
2613 return error;
2614
2615 /*
2616 * Read the block containing the "new" attr, if it is different
2617 */
2618 if (args->blkno2 != args->blkno) {
2619 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
2620 -1, &bp2);
2621 if (error)
2622 return error;
2623 } else {
2624 bp2 = bp1;
2625 }
2626
2627 leaf1 = bp1->b_addr;
2628 entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
2629
2630 leaf2 = bp2->b_addr;
2631 entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
2632
2633#ifdef DEBUG
2634 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
2635 ASSERT(args->index < ichdr1.count);
2636 ASSERT(args->index >= 0);
2637
2638 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
2639 ASSERT(args->index2 < ichdr2.count);
2640 ASSERT(args->index2 >= 0);
2641
2642 if (entry1->flags & XFS_ATTR_LOCAL) {
2643 name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
2644 namelen1 = name_loc->namelen;
2645 name1 = (char *)name_loc->nameval;
2646 } else {
2647 name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
2648 namelen1 = name_rmt->namelen;
2649 name1 = (char *)name_rmt->name;
2650 }
2651 if (entry2->flags & XFS_ATTR_LOCAL) {
2652 name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
2653 namelen2 = name_loc->namelen;
2654 name2 = (char *)name_loc->nameval;
2655 } else {
2656 name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
2657 namelen2 = name_rmt->namelen;
2658 name2 = (char *)name_rmt->name;
2659 }
2660 ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
2661 ASSERT(namelen1 == namelen2);
2662 ASSERT(memcmp(name1, name2, namelen1) == 0);
2663#endif /* DEBUG */
2664
2665 ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
2666 ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
2667
2668 entry1->flags &= ~XFS_ATTR_INCOMPLETE;
2669 xfs_trans_log_buf(args->trans, bp1,
2670 XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
2671 if (args->rmtblkno) {
2672 ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
2673 name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
2674 name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
2675 name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
2676 xfs_trans_log_buf(args->trans, bp1,
2677 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
2678 }
2679
2680 entry2->flags |= XFS_ATTR_INCOMPLETE;
2681 xfs_trans_log_buf(args->trans, bp2,
2682 XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
2683 if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
2684 name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
2685 name_rmt->valueblk = 0;
2686 name_rmt->valuelen = 0;
2687 xfs_trans_log_buf(args->trans, bp2,
2688 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
2689 }
2690
2691 /*
2692 * Commit the flag value change and start the next trans in series.
2693 */
2694 error = xfs_trans_roll(&args->trans, args->dp);
2695
2696 return error;
2697}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
new file mode 100644
index 000000000000..e2929da7c3ba
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -0,0 +1,108 @@
1/*
2 * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#ifndef __XFS_ATTR_LEAF_H__
20#define __XFS_ATTR_LEAF_H__
21
22struct attrlist;
23struct attrlist_cursor_kern;
24struct xfs_attr_list_context;
25struct xfs_da_args;
26struct xfs_da_state;
27struct xfs_da_state_blk;
28struct xfs_inode;
29struct xfs_trans;
30
31/*
32 * Used to keep a list of "remote value" extents when unlinking an inode.
33 */
34typedef struct xfs_attr_inactive_list {
35 xfs_dablk_t valueblk; /* block number of value bytes */
36 int valuelen; /* number of bytes in value */
37} xfs_attr_inactive_list_t;
38
39
40/*========================================================================
41 * Function prototypes for the kernel.
42 *========================================================================*/
43
44/*
45 * Internal routines when attribute fork size < XFS_LITINO(mp).
46 */
47void xfs_attr_shortform_create(struct xfs_da_args *args);
48void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
49int xfs_attr_shortform_lookup(struct xfs_da_args *args);
50int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
51int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
52int xfs_attr_shortform_remove(struct xfs_da_args *args);
53int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
54int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
55int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
56
57
58/*
59 * Internal routines when attribute fork size == XFS_LBSIZE(mp).
60 */
61int xfs_attr3_leaf_to_node(struct xfs_da_args *args);
62int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
63 struct xfs_da_args *args, int forkoff);
64int xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
65int xfs_attr3_leaf_setflag(struct xfs_da_args *args);
66int xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
67
68/*
69 * Routines used for growing the Btree.
70 */
71int xfs_attr3_leaf_split(struct xfs_da_state *state,
72 struct xfs_da_state_blk *oldblk,
73 struct xfs_da_state_blk *newblk);
74int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
75 struct xfs_da_args *args);
76int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
77int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
78 struct xfs_da_args *args);
79int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
80 struct xfs_da_args *args);
81int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
82 struct xfs_attr_list_context *context);
83
84/*
85 * Routines used for shrinking the Btree.
86 */
87int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
88void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
89 struct xfs_da_state_blk *drop_blk,
90 struct xfs_da_state_blk *save_blk);
91int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
92
93/*
94 * Utility routines.
95 */
96xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
97int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
98 struct xfs_buf *leaf2_bp);
99int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
100int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
101 xfs_dablk_t bno, xfs_daddr_t mappedbno,
102 struct xfs_buf **bpp);
103void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
104 struct xfs_attr_leafblock *from);
105void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
106 struct xfs_attr3_icleaf_hdr *from);
107
108#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
new file mode 100644
index 000000000000..7510ab8058a4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -0,0 +1,628 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_shared.h"
22#include "xfs_format.h"
23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h"
25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_format.h"
30#include "xfs_da_btree.h"
31#include "xfs_inode.h"
32#include "xfs_alloc.h"
33#include "xfs_trans.h"
34#include "xfs_inode_item.h"
35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
37#include "xfs_attr.h"
38#include "xfs_attr_leaf.h"
39#include "xfs_attr_remote.h"
40#include "xfs_trans_space.h"
41#include "xfs_trace.h"
42#include "xfs_cksum.h"
43#include "xfs_buf_item.h"
44#include "xfs_error.h"
45
46#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
47
48/*
49 * Each contiguous block has a header, so it is not just a simple attribute
50 * length to FSB conversion.
51 */
52int
53xfs_attr3_rmt_blocks(
54 struct xfs_mount *mp,
55 int attrlen)
56{
57 if (xfs_sb_version_hascrc(&mp->m_sb)) {
58 int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
59 return (attrlen + buflen - 1) / buflen;
60 }
61 return XFS_B_TO_FSB(mp, attrlen);
62}
63
64/*
65 * Checking of the remote attribute header is split into two parts. The verifier
66 * does CRC, location and bounds checking, the unpacking function checks the
67 * attribute parameters and owner.
68 */
69static bool
70xfs_attr3_rmt_hdr_ok(
71 void *ptr,
72 xfs_ino_t ino,
73 uint32_t offset,
74 uint32_t size,
75 xfs_daddr_t bno)
76{
77 struct xfs_attr3_rmt_hdr *rmt = ptr;
78
79 if (bno != be64_to_cpu(rmt->rm_blkno))
80 return false;
81 if (offset != be32_to_cpu(rmt->rm_offset))
82 return false;
83 if (size != be32_to_cpu(rmt->rm_bytes))
84 return false;
85 if (ino != be64_to_cpu(rmt->rm_owner))
86 return false;
87
88 /* ok */
89 return true;
90}
91
92static bool
93xfs_attr3_rmt_verify(
94 struct xfs_mount *mp,
95 void *ptr,
96 int fsbsize,
97 xfs_daddr_t bno)
98{
99 struct xfs_attr3_rmt_hdr *rmt = ptr;
100
101 if (!xfs_sb_version_hascrc(&mp->m_sb))
102 return false;
103 if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
104 return false;
105 if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
106 return false;
107 if (be64_to_cpu(rmt->rm_blkno) != bno)
108 return false;
109 if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
110 return false;
111 if (be32_to_cpu(rmt->rm_offset) +
112 be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
113 return false;
114 if (rmt->rm_owner == 0)
115 return false;
116
117 return true;
118}
119
120static void
121xfs_attr3_rmt_read_verify(
122 struct xfs_buf *bp)
123{
124 struct xfs_mount *mp = bp->b_target->bt_mount;
125 char *ptr;
126 int len;
127 xfs_daddr_t bno;
128 int blksize = mp->m_attr_geo->blksize;
129
130 /* no verification of non-crc buffers */
131 if (!xfs_sb_version_hascrc(&mp->m_sb))
132 return;
133
134 ptr = bp->b_addr;
135 bno = bp->b_bn;
136 len = BBTOB(bp->b_length);
137 ASSERT(len >= blksize);
138
139 while (len > 0) {
140 if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
141 xfs_buf_ioerror(bp, -EFSBADCRC);
142 break;
143 }
144 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
145 xfs_buf_ioerror(bp, -EFSCORRUPTED);
146 break;
147 }
148 len -= blksize;
149 ptr += blksize;
150 bno += BTOBB(blksize);
151 }
152
153 if (bp->b_error)
154 xfs_verifier_error(bp);
155 else
156 ASSERT(len == 0);
157}
158
159static void
160xfs_attr3_rmt_write_verify(
161 struct xfs_buf *bp)
162{
163 struct xfs_mount *mp = bp->b_target->bt_mount;
164 struct xfs_buf_log_item *bip = bp->b_fspriv;
165 char *ptr;
166 int len;
167 xfs_daddr_t bno;
168 int blksize = mp->m_attr_geo->blksize;
169
170 /* no verification of non-crc buffers */
171 if (!xfs_sb_version_hascrc(&mp->m_sb))
172 return;
173
174 ptr = bp->b_addr;
175 bno = bp->b_bn;
176 len = BBTOB(bp->b_length);
177 ASSERT(len >= blksize);
178
179 while (len > 0) {
180 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
181 xfs_buf_ioerror(bp, -EFSCORRUPTED);
182 xfs_verifier_error(bp);
183 return;
184 }
185 if (bip) {
186 struct xfs_attr3_rmt_hdr *rmt;
187
188 rmt = (struct xfs_attr3_rmt_hdr *)ptr;
189 rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
190 }
191 xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
192
193 len -= blksize;
194 ptr += blksize;
195 bno += BTOBB(blksize);
196 }
197 ASSERT(len == 0);
198}
199
200const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
201 .verify_read = xfs_attr3_rmt_read_verify,
202 .verify_write = xfs_attr3_rmt_write_verify,
203};
204
205STATIC int
206xfs_attr3_rmt_hdr_set(
207 struct xfs_mount *mp,
208 void *ptr,
209 xfs_ino_t ino,
210 uint32_t offset,
211 uint32_t size,
212 xfs_daddr_t bno)
213{
214 struct xfs_attr3_rmt_hdr *rmt = ptr;
215
216 if (!xfs_sb_version_hascrc(&mp->m_sb))
217 return 0;
218
219 rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
220 rmt->rm_offset = cpu_to_be32(offset);
221 rmt->rm_bytes = cpu_to_be32(size);
222 uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
223 rmt->rm_owner = cpu_to_be64(ino);
224 rmt->rm_blkno = cpu_to_be64(bno);
225
226 return sizeof(struct xfs_attr3_rmt_hdr);
227}
228
229/*
230 * Helper functions to copy attribute data in and out of the one disk extents
231 */
232STATIC int
233xfs_attr_rmtval_copyout(
234 struct xfs_mount *mp,
235 struct xfs_buf *bp,
236 xfs_ino_t ino,
237 int *offset,
238 int *valuelen,
239 __uint8_t **dst)
240{
241 char *src = bp->b_addr;
242 xfs_daddr_t bno = bp->b_bn;
243 int len = BBTOB(bp->b_length);
244 int blksize = mp->m_attr_geo->blksize;
245
246 ASSERT(len >= blksize);
247
248 while (len > 0 && *valuelen > 0) {
249 int hdr_size = 0;
250 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
251
252 byte_cnt = min(*valuelen, byte_cnt);
253
254 if (xfs_sb_version_hascrc(&mp->m_sb)) {
255 if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
256 byte_cnt, bno)) {
257 xfs_alert(mp,
258"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
259 bno, *offset, byte_cnt, ino);
260 return -EFSCORRUPTED;
261 }
262 hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
263 }
264
265 memcpy(*dst, src + hdr_size, byte_cnt);
266
267 /* roll buffer forwards */
268 len -= blksize;
269 src += blksize;
270 bno += BTOBB(blksize);
271
272 /* roll attribute data forwards */
273 *valuelen -= byte_cnt;
274 *dst += byte_cnt;
275 *offset += byte_cnt;
276 }
277 return 0;
278}
279
280STATIC void
281xfs_attr_rmtval_copyin(
282 struct xfs_mount *mp,
283 struct xfs_buf *bp,
284 xfs_ino_t ino,
285 int *offset,
286 int *valuelen,
287 __uint8_t **src)
288{
289 char *dst = bp->b_addr;
290 xfs_daddr_t bno = bp->b_bn;
291 int len = BBTOB(bp->b_length);
292 int blksize = mp->m_attr_geo->blksize;
293
294 ASSERT(len >= blksize);
295
296 while (len > 0 && *valuelen > 0) {
297 int hdr_size;
298 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
299
300 byte_cnt = min(*valuelen, byte_cnt);
301 hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
302 byte_cnt, bno);
303
304 memcpy(dst + hdr_size, *src, byte_cnt);
305
306 /*
307 * If this is the last block, zero the remainder of it.
308 * Check that we are actually the last block, too.
309 */
310 if (byte_cnt + hdr_size < blksize) {
311 ASSERT(*valuelen - byte_cnt == 0);
312 ASSERT(len == blksize);
313 memset(dst + hdr_size + byte_cnt, 0,
314 blksize - hdr_size - byte_cnt);
315 }
316
317 /* roll buffer forwards */
318 len -= blksize;
319 dst += blksize;
320 bno += BTOBB(blksize);
321
322 /* roll attribute data forwards */
323 *valuelen -= byte_cnt;
324 *src += byte_cnt;
325 *offset += byte_cnt;
326 }
327}
328
329/*
330 * Read the value associated with an attribute from the out-of-line buffer
331 * that we stored it in.
332 */
333int
334xfs_attr_rmtval_get(
335 struct xfs_da_args *args)
336{
337 struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
338 struct xfs_mount *mp = args->dp->i_mount;
339 struct xfs_buf *bp;
340 xfs_dablk_t lblkno = args->rmtblkno;
341 __uint8_t *dst = args->value;
342 int valuelen;
343 int nmap;
344 int error;
345 int blkcnt = args->rmtblkcnt;
346 int i;
347 int offset = 0;
348
349 trace_xfs_attr_rmtval_get(args);
350
351 ASSERT(!(args->flags & ATTR_KERNOVAL));
352 ASSERT(args->rmtvaluelen == args->valuelen);
353
354 valuelen = args->rmtvaluelen;
355 while (valuelen > 0) {
356 nmap = ATTR_RMTVALUE_MAPSIZE;
357 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
358 blkcnt, map, &nmap,
359 XFS_BMAPI_ATTRFORK);
360 if (error)
361 return error;
362 ASSERT(nmap >= 1);
363
364 for (i = 0; (i < nmap) && (valuelen > 0); i++) {
365 xfs_daddr_t dblkno;
366 int dblkcnt;
367
368 ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
369 (map[i].br_startblock != HOLESTARTBLOCK));
370 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
371 dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
372 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
373 dblkno, dblkcnt, 0, &bp,
374 &xfs_attr3_rmt_buf_ops);
375 if (error)
376 return error;
377
378 error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
379 &offset, &valuelen,
380 &dst);
381 xfs_buf_relse(bp);
382 if (error)
383 return error;
384
385 /* roll attribute extent map forwards */
386 lblkno += map[i].br_blockcount;
387 blkcnt -= map[i].br_blockcount;
388 }
389 }
390 ASSERT(valuelen == 0);
391 return 0;
392}
393
394/*
395 * Write the value associated with an attribute into the out-of-line buffer
396 * that we have defined for it.
397 */
398int
399xfs_attr_rmtval_set(
400 struct xfs_da_args *args)
401{
402 struct xfs_inode *dp = args->dp;
403 struct xfs_mount *mp = dp->i_mount;
404 struct xfs_bmbt_irec map;
405 xfs_dablk_t lblkno;
406 xfs_fileoff_t lfileoff = 0;
407 __uint8_t *src = args->value;
408 int blkcnt;
409 int valuelen;
410 int nmap;
411 int error;
412 int offset = 0;
413
414 trace_xfs_attr_rmtval_set(args);
415
416 /*
417 * Find a "hole" in the attribute address space large enough for
418 * us to drop the new attribute's value into. Because CRC enable
419 * attributes have headers, we can't just do a straight byte to FSB
420 * conversion and have to take the header space into account.
421 */
422 blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
423 error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
424 XFS_ATTR_FORK);
425 if (error)
426 return error;
427
428 args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
429 args->rmtblkcnt = blkcnt;
430
431 /*
432 * Roll through the "value", allocating blocks on disk as required.
433 */
434 while (blkcnt > 0) {
435 int committed;
436
437 /*
438 * Allocate a single extent, up to the size of the value.
439 */
440 xfs_bmap_init(args->flist, args->firstblock);
441 nmap = 1;
442 error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
443 blkcnt,
444 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
445 args->firstblock, args->total, &map, &nmap,
446 args->flist);
447 if (!error) {
448 error = xfs_bmap_finish(&args->trans, args->flist,
449 &committed);
450 }
451 if (error) {
452 ASSERT(committed);
453 args->trans = NULL;
454 xfs_bmap_cancel(args->flist);
455 return error;
456 }
457
458 /*
459 * bmap_finish() may have committed the last trans and started
460 * a new one. We need the inode to be in all transactions.
461 */
462 if (committed)
463 xfs_trans_ijoin(args->trans, dp, 0);
464
465 ASSERT(nmap == 1);
466 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
467 (map.br_startblock != HOLESTARTBLOCK));
468 lblkno += map.br_blockcount;
469 blkcnt -= map.br_blockcount;
470
471 /*
472 * Start the next trans in the chain.
473 */
474 error = xfs_trans_roll(&args->trans, dp);
475 if (error)
476 return error;
477 }
478
479 /*
480 * Roll through the "value", copying the attribute value to the
481 * already-allocated blocks. Blocks are written synchronously
482 * so that we can know they are all on disk before we turn off
483 * the INCOMPLETE flag.
484 */
485 lblkno = args->rmtblkno;
486 blkcnt = args->rmtblkcnt;
487 valuelen = args->rmtvaluelen;
488 while (valuelen > 0) {
489 struct xfs_buf *bp;
490 xfs_daddr_t dblkno;
491 int dblkcnt;
492
493 ASSERT(blkcnt > 0);
494
495 xfs_bmap_init(args->flist, args->firstblock);
496 nmap = 1;
497 error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
498 blkcnt, &map, &nmap,
499 XFS_BMAPI_ATTRFORK);
500 if (error)
501 return error;
502 ASSERT(nmap == 1);
503 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
504 (map.br_startblock != HOLESTARTBLOCK));
505
506 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
507 dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
508
509 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
510 if (!bp)
511 return -ENOMEM;
512 bp->b_ops = &xfs_attr3_rmt_buf_ops;
513
514 xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
515 &valuelen, &src);
516
517 error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
518 xfs_buf_relse(bp);
519 if (error)
520 return error;
521
522
523 /* roll attribute extent map forwards */
524 lblkno += map.br_blockcount;
525 blkcnt -= map.br_blockcount;
526 }
527 ASSERT(valuelen == 0);
528 return 0;
529}
530
531/*
532 * Remove the value associated with an attribute by deleting the
533 * out-of-line buffer that it is stored on.
534 */
535int
536xfs_attr_rmtval_remove(
537 struct xfs_da_args *args)
538{
539 struct xfs_mount *mp = args->dp->i_mount;
540 xfs_dablk_t lblkno;
541 int blkcnt;
542 int error;
543 int done;
544
545 trace_xfs_attr_rmtval_remove(args);
546
547 /*
548 * Roll through the "value", invalidating the attribute value's blocks.
549 */
550 lblkno = args->rmtblkno;
551 blkcnt = args->rmtblkcnt;
552 while (blkcnt > 0) {
553 struct xfs_bmbt_irec map;
554 struct xfs_buf *bp;
555 xfs_daddr_t dblkno;
556 int dblkcnt;
557 int nmap;
558
559 /*
560 * Try to remember where we decided to put the value.
561 */
562 nmap = 1;
563 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
564 blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
565 if (error)
566 return error;
567 ASSERT(nmap == 1);
568 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
569 (map.br_startblock != HOLESTARTBLOCK));
570
571 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
572 dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
573
574 /*
575 * If the "remote" value is in the cache, remove it.
576 */
577 bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
578 if (bp) {
579 xfs_buf_stale(bp);
580 xfs_buf_relse(bp);
581 bp = NULL;
582 }
583
584 lblkno += map.br_blockcount;
585 blkcnt -= map.br_blockcount;
586 }
587
588 /*
589 * Keep de-allocating extents until the remote-value region is gone.
590 */
591 lblkno = args->rmtblkno;
592 blkcnt = args->rmtblkcnt;
593 done = 0;
594 while (!done) {
595 int committed;
596
597 xfs_bmap_init(args->flist, args->firstblock);
598 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
599 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
600 1, args->firstblock, args->flist,
601 &done);
602 if (!error) {
603 error = xfs_bmap_finish(&args->trans, args->flist,
604 &committed);
605 }
606 if (error) {
607 ASSERT(committed);
608 args->trans = NULL;
609 xfs_bmap_cancel(args->flist);
610 return error;
611 }
612
613 /*
614 * bmap_finish() may have committed the last trans and started
615 * a new one. We need the inode to be in all transactions.
616 */
617 if (committed)
618 xfs_trans_ijoin(args->trans, args->dp, 0);
619
620 /*
621 * Close out trans and start the next one in the chain.
622 */
623 error = xfs_trans_roll(&args->trans, args->dp);
624 if (error)
625 return error;
626 }
627 return 0;
628}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
new file mode 100644
index 000000000000..5a9acfa156d7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (c) 2013 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_ATTR_REMOTE_H__
19#define __XFS_ATTR_REMOTE_H__
20
21int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
22
23int xfs_attr_rmtval_get(struct xfs_da_args *args);
24int xfs_attr_rmtval_set(struct xfs_da_args *args);
25int xfs_attr_rmtval_remove(struct xfs_da_args *args);
26
27#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
new file mode 100644
index 000000000000..919756e3ba53
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -0,0 +1,70 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_ATTR_SF_H__
19#define __XFS_ATTR_SF_H__
20
21/*
22 * Attribute storage when stored inside the inode.
23 *
24 * Small attribute lists are packed as tightly as possible so as
25 * to fit into the literal area of the inode.
26 */
27
28/*
29 * Entries are packed toward the top as tight as possible.
30 */
31typedef struct xfs_attr_shortform {
32 struct xfs_attr_sf_hdr { /* constant-structure header block */
33 __be16 totsize; /* total bytes in shortform list */
34 __u8 count; /* count of active entries */
35 } hdr;
36 struct xfs_attr_sf_entry {
37 __uint8_t namelen; /* actual length of name (no NULL) */
38 __uint8_t valuelen; /* actual length of value (no NULL) */
39 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
40 __uint8_t nameval[1]; /* name & value bytes concatenated */
41 } list[1]; /* variable sized array */
42} xfs_attr_shortform_t;
43typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
44typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
45
46/*
47 * We generate this then sort it, attr_list() must return things in hash-order.
48 */
49typedef struct xfs_attr_sf_sort {
50 __uint8_t entno; /* entry number in original list */
51 __uint8_t namelen; /* length of name value (no null) */
52 __uint8_t valuelen; /* length of value */
53 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
54 xfs_dahash_t hash; /* this entry's hash value */
55 unsigned char *name; /* name value, pointer into buffer */
56} xfs_attr_sf_sort_t;
57
/*
 * Size helpers for shortform entries.  The "- 1" removes the one-byte
 * nameval[] placeholder that sizeof(xfs_attr_sf_entry_t) already counts.
 */

/* space name/value uses */
#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen, vlen) \
	((int)sizeof(xfs_attr_sf_entry_t) - 1 + (nlen) + (vlen))

/* max space for name&value */
#define XFS_ATTR_SF_ENTSIZE_MAX \
	((1 << (NBBY * (int)sizeof(__uint8_t))) - 1)

/* space an entry uses */
#define XFS_ATTR_SF_ENTSIZE(sfep) \
	((int)sizeof(xfs_attr_sf_entry_t) - 1 + \
	 (sfep)->namelen + (sfep)->valuelen)

/* next entry in struct */
#define XFS_ATTR_SF_NEXTENTRY(sfep) \
	((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))

/* total space in use */
#define XFS_ATTR_SF_TOTSIZE(dp) \
	(be16_to_cpu(((xfs_attr_shortform_t *) \
		((dp)->i_afp->if_u1.if_data))->hdr.totsize))
69
70#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
new file mode 100644
index 000000000000..e1649c0d3e02
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bit.h
@@ -0,0 +1,87 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BIT_H__
19#define __XFS_BIT_H__
20
21/*
22 * XFS bit manipulation routines.
23 */
24
25/*
26 * masks with n high/low bits set, 64-bit values
27 */
/*
 * Return a 64-bit mask with the @n high-order bits set.
 *
 * @n <= 0 yields an empty mask; it is handled separately because shifting
 * a 64-bit value by 64 or more is undefined in C.
 */
static inline __uint64_t xfs_mask64hi(int n)
{
	if (n <= 0)
		return 0;
	return (__uint64_t)-1 << (64 - (n));
}
/*
 * Return a 32-bit mask with the @n low-order bits set.
 *
 * @n >= 32 yields a full mask; it is handled separately because shifting
 * a 32-bit value by its width or more is undefined in C.
 */
static inline __uint32_t xfs_mask32lo(int n)
{
	if (n >= 32)
		return ~(__uint32_t)0;
	return ((__uint32_t)1 << (n)) - 1;
}
/*
 * Return a 64-bit mask with the @n low-order bits set.
 *
 * @n >= 64 yields a full mask; it is handled separately because shifting
 * a 64-bit value by its width or more is undefined in C.
 */
static inline __uint64_t xfs_mask64lo(int n)
{
	if (n >= 64)
		return ~(__uint64_t)0;
	return ((__uint64_t)1 << (n)) - 1;
}
40
/*
 * Index of the highest set bit in a 32-bit value, or -1 if no bit is set.
 * The result is fls() of the value, shifted down by one.
 */
static inline int xfs_highbit32(__uint32_t v)
{
	int	pos = fls(v);

	return pos - 1;
}
46
/*
 * Index of the highest set bit in a 64-bit value, or -1 if no bit is set.
 * The result is fls64() of the value, shifted down by one.
 */
static inline int xfs_highbit64(__uint64_t v)
{
	int	pos = fls64(v);

	return pos - 1;
}
52
/*
 * Index of the lowest set bit in a 32-bit value, or -1 if no bit is set.
 * The result is ffs() of the value, shifted down by one.
 */
static inline int xfs_lowbit32(__uint32_t v)
{
	int	pos = ffs(v);

	return pos - 1;
}
58
/*
 * Index of the lowest set bit in a 64-bit value, or -1 if no bit is set.
 *
 * The value is examined as two 32-bit halves so that only ffs() is needed:
 * the low half is searched first, and a hit in the high half is offset
 * by 32.
 */
static inline int xfs_lowbit64(__uint64_t v)
{
	__uint32_t	lo = (__uint32_t)v;
	__uint32_t	hi = (__uint32_t)(v >> 32);

	if (lo)
		return ffs(lo) - 1;
	if (hi)
		return ffs(hi) + 32 - 1;
	return -1;
}
77
78/* Return whether bitmap is empty (1 == empty) */
79extern int xfs_bitmap_empty(uint *map, uint size);
80
81/* Count continuous one bits in map starting with start_bit */
82extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
83
84/* Find next set bit in map */
85extern int xfs_next_bit(uint *map, uint size, uint start_bit);
86
87#endif /* __XFS_BIT_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644
index 000000000000..94ac88306fa6
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -0,0 +1,5606 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_format.h"
30#include "xfs_da_btree.h"
31#include "xfs_dir2.h"
32#include "xfs_inode.h"
33#include "xfs_btree.h"
34#include "xfs_trans.h"
35#include "xfs_inode_item.h"
36#include "xfs_extfree_item.h"
37#include "xfs_alloc.h"
38#include "xfs_bmap.h"
39#include "xfs_bmap_util.h"
40#include "xfs_bmap_btree.h"
41#include "xfs_rtalloc.h"
42#include "xfs_error.h"
43#include "xfs_quota.h"
44#include "xfs_trans_space.h"
45#include "xfs_buf_item.h"
46#include "xfs_trace.h"
47#include "xfs_symlink.h"
48#include "xfs_attr_leaf.h"
49#include "xfs_dinode.h"
50#include "xfs_filestream.h"
51
52
53kmem_zone_t *xfs_bmap_free_item_zone;
54
55/*
56 * Miscellaneous helper functions
57 */
58
59/*
60 * Compute and fill in the value of the maximum depth of a bmap btree
61 * in this filesystem. Done once, during mount.
62 */
63void
64xfs_bmap_compute_maxlevels(
65 xfs_mount_t *mp, /* file system mount structure */
66 int whichfork) /* data or attr fork */
67{
68 int level; /* btree level */
69 uint maxblocks; /* max blocks at this level */
70 uint maxleafents; /* max leaf entries possible */
71 int maxrootrecs; /* max records in root block */
72 int minleafrecs; /* min records in leaf block */
73 int minnoderecs; /* min records in node block */
74 int sz; /* root block size */
75
76 /*
77 * The maximum number of extents in a file, hence the maximum
78 * number of leaf entries, is controlled by the type of di_nextents
79 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
80 * (a signed 16-bit number, xfs_aextnum_t).
81 *
82 * Note that we can no longer assume that if we are in ATTR1 that
83 * the fork offset of all the inodes will be
84 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
85 * with ATTR2 and then mounted back with ATTR1, keeping the
86 * di_forkoff's fixed but probably at various positions. Therefore,
87 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
88 * of a minimum size available.
89 */
90 if (whichfork == XFS_DATA_FORK) {
91 maxleafents = MAXEXTNUM;
92 sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
93 } else {
94 maxleafents = MAXAEXTNUM;
95 sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
96 }
97 maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
98 minleafrecs = mp->m_bmap_dmnr[0];
99 minnoderecs = mp->m_bmap_dmnr[1];
100 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
101 for (level = 1; maxblocks > 1; level++) {
102 if (maxblocks <= maxrootrecs)
103 maxblocks = 1;
104 else
105 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
106 }
107 mp->m_bm_maxlevels[whichfork] = level;
108}
109
110STATIC int /* error */
111xfs_bmbt_lookup_eq(
112 struct xfs_btree_cur *cur,
113 xfs_fileoff_t off,
114 xfs_fsblock_t bno,
115 xfs_filblks_t len,
116 int *stat) /* success/failure */
117{
118 cur->bc_rec.b.br_startoff = off;
119 cur->bc_rec.b.br_startblock = bno;
120 cur->bc_rec.b.br_blockcount = len;
121 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
122}
123
124STATIC int /* error */
125xfs_bmbt_lookup_ge(
126 struct xfs_btree_cur *cur,
127 xfs_fileoff_t off,
128 xfs_fsblock_t bno,
129 xfs_filblks_t len,
130 int *stat) /* success/failure */
131{
132 cur->bc_rec.b.br_startoff = off;
133 cur->bc_rec.b.br_startblock = bno;
134 cur->bc_rec.b.br_blockcount = len;
135 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
136}
137
138/*
139 * Check if the inode needs to be converted to btree format.
140 */
141static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
142{
143 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
144 XFS_IFORK_NEXTENTS(ip, whichfork) >
145 XFS_IFORK_MAXEXT(ip, whichfork);
146}
147
148/*
149 * Check if the inode should be converted to extent format.
150 */
151static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
152{
153 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
154 XFS_IFORK_NEXTENTS(ip, whichfork) <=
155 XFS_IFORK_MAXEXT(ip, whichfork);
156}
157
158/*
159 * Update the record referred to by cur to the value given
160 * by [off, bno, len, state].
161 * This either works (return 0) or gets an EFSCORRUPTED error.
162 */
163STATIC int
164xfs_bmbt_update(
165 struct xfs_btree_cur *cur,
166 xfs_fileoff_t off,
167 xfs_fsblock_t bno,
168 xfs_filblks_t len,
169 xfs_exntst_t state)
170{
171 union xfs_btree_rec rec;
172
173 xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
174 return xfs_btree_update(cur, &rec);
175}
176
177/*
178 * Compute the worst-case number of indirect blocks that will be used
179 * for ip's delayed extent of length "len".
180 */
181STATIC xfs_filblks_t
182xfs_bmap_worst_indlen(
183 xfs_inode_t *ip, /* incore inode pointer */
184 xfs_filblks_t len) /* delayed extent length */
185{
186 int level; /* btree level number */
187 int maxrecs; /* maximum record count at this level */
188 xfs_mount_t *mp; /* mount structure */
189 xfs_filblks_t rval; /* return value */
190
191 mp = ip->i_mount;
192 maxrecs = mp->m_bmap_dmxr[0];
193 for (level = 0, rval = 0;
194 level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
195 level++) {
196 len += maxrecs - 1;
197 do_div(len, maxrecs);
198 rval += len;
199 if (len == 1)
200 return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
201 level - 1;
202 if (level == 0)
203 maxrecs = mp->m_bmap_dmxr[1];
204 }
205 return rval;
206}
207
208/*
209 * Calculate the default attribute fork offset for newly created inodes.
210 */
211uint
212xfs_default_attroffset(
213 struct xfs_inode *ip)
214{
215 struct xfs_mount *mp = ip->i_mount;
216 uint offset;
217
218 if (mp->m_sb.sb_inodesize == 256) {
219 offset = XFS_LITINO(mp, ip->i_d.di_version) -
220 XFS_BMDR_SPACE_CALC(MINABTPTRS);
221 } else {
222 offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
223 }
224
225 ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
226 return offset;
227}
228
229/*
230 * Helper routine to reset inode di_forkoff field when switching
231 * attribute fork from local to extent format - we reset it where
232 * possible to make space available for inline data fork extents.
233 */
234STATIC void
235xfs_bmap_forkoff_reset(
236 xfs_inode_t *ip,
237 int whichfork)
238{
239 if (whichfork == XFS_ATTR_FORK &&
240 ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
241 ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
242 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
243 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
244
245 if (dfl_forkoff > ip->i_d.di_forkoff)
246 ip->i_d.di_forkoff = dfl_forkoff;
247 }
248}
249
250/*
251 * Debug/sanity checking code
252 */
253
254STATIC int
255xfs_bmap_sanity_check(
256 struct xfs_mount *mp,
257 struct xfs_buf *bp,
258 int level)
259{
260 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
261
262 if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
263 block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
264 return 0;
265
266 if (be16_to_cpu(block->bb_level) != level ||
267 be16_to_cpu(block->bb_numrecs) == 0 ||
268 be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
269 return 0;
270
271 return 1;
272}
273
274#ifdef DEBUG
275STATIC struct xfs_buf *
276xfs_bmap_get_bp(
277 struct xfs_btree_cur *cur,
278 xfs_fsblock_t bno)
279{
280 struct xfs_log_item_desc *lidp;
281 int i;
282
283 if (!cur)
284 return NULL;
285
286 for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
287 if (!cur->bc_bufs[i])
288 break;
289 if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
290 return cur->bc_bufs[i];
291 }
292
293 /* Chase down all the log items to see if the bp is there */
294 list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
295 struct xfs_buf_log_item *bip;
296 bip = (struct xfs_buf_log_item *)lidp->lid_item;
297 if (bip->bli_item.li_type == XFS_LI_BUF &&
298 XFS_BUF_ADDR(bip->bli_buf) == bno)
299 return bip->bli_buf;
300 }
301
302 return NULL;
303}
304
305STATIC void
306xfs_check_block(
307 struct xfs_btree_block *block,
308 xfs_mount_t *mp,
309 int root,
310 short sz)
311{
312 int i, j, dmxr;
313 __be64 *pp, *thispa; /* pointer to block address */
314 xfs_bmbt_key_t *prevp, *keyp;
315
316 ASSERT(be16_to_cpu(block->bb_level) > 0);
317
318 prevp = NULL;
319 for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
320 dmxr = mp->m_bmap_dmxr[0];
321 keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
322
323 if (prevp) {
324 ASSERT(be64_to_cpu(prevp->br_startoff) <
325 be64_to_cpu(keyp->br_startoff));
326 }
327 prevp = keyp;
328
329 /*
330 * Compare the block numbers to see if there are dups.
331 */
332 if (root)
333 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
334 else
335 pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
336
337 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
338 if (root)
339 thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
340 else
341 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
342 if (*thispa == *pp) {
343 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
344 __func__, j, i,
345 (unsigned long long)be64_to_cpu(*thispa));
346 panic("%s: ptrs are equal in node\n",
347 __func__);
348 }
349 }
350 }
351}
352
353/*
354 * Check that the extents for the inode ip are in the right order in all
355 * btree leaves.
356 */
357
358STATIC void
359xfs_bmap_check_leaf_extents(
360 xfs_btree_cur_t *cur, /* btree cursor or null */
361 xfs_inode_t *ip, /* incore inode pointer */
362 int whichfork) /* data or attr fork */
363{
364 struct xfs_btree_block *block; /* current btree block */
365 xfs_fsblock_t bno; /* block # of "block" */
366 xfs_buf_t *bp; /* buffer for "block" */
367 int error; /* error return value */
368 xfs_extnum_t i=0, j; /* index into the extents list */
369 xfs_ifork_t *ifp; /* fork structure */
370 int level; /* btree level, for checking */
371 xfs_mount_t *mp; /* file system mount structure */
372 __be64 *pp; /* pointer to block address */
373 xfs_bmbt_rec_t *ep; /* pointer to current extent */
374 xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
375 xfs_bmbt_rec_t *nextp; /* pointer to next extent */
376 int bp_release = 0;
377
378 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
379 return;
380 }
381
382 bno = NULLFSBLOCK;
383 mp = ip->i_mount;
384 ifp = XFS_IFORK_PTR(ip, whichfork);
385 block = ifp->if_broot;
386 /*
387 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
388 */
389 level = be16_to_cpu(block->bb_level);
390 ASSERT(level > 0);
391 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
392 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
393 bno = be64_to_cpu(*pp);
394
395 ASSERT(bno != NULLDFSBNO);
396 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
397 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
398
399 /*
400 * Go down the tree until leaf level is reached, following the first
401 * pointer (leftmost) at each level.
402 */
403 while (level-- > 0) {
404 /* See if buf is in cur first */
405 bp_release = 0;
406 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
407 if (!bp) {
408 bp_release = 1;
409 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
410 XFS_BMAP_BTREE_REF,
411 &xfs_bmbt_buf_ops);
412 if (error)
413 goto error_norelse;
414 }
415 block = XFS_BUF_TO_BLOCK(bp);
416 XFS_WANT_CORRUPTED_GOTO(
417 xfs_bmap_sanity_check(mp, bp, level),
418 error0);
419 if (level == 0)
420 break;
421
422 /*
423 * Check this block for basic sanity (increasing keys and
424 * no duplicate blocks).
425 */
426
427 xfs_check_block(block, mp, 0, 0);
428 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
429 bno = be64_to_cpu(*pp);
430 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
431 if (bp_release) {
432 bp_release = 0;
433 xfs_trans_brelse(NULL, bp);
434 }
435 }
436
437 /*
438 * Here with bp and block set to the leftmost leaf node in the tree.
439 */
440 i = 0;
441
442 /*
443 * Loop over all leaf nodes checking that all extents are in the right order.
444 */
445 for (;;) {
446 xfs_fsblock_t nextbno;
447 xfs_extnum_t num_recs;
448
449
450 num_recs = xfs_btree_get_numrecs(block);
451
452 /*
453 * Read-ahead the next leaf block, if any.
454 */
455
456 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
457
458 /*
459 * Check all the extents to make sure they are OK.
460 * If we had a previous block, the last entry should
461 * conform with the first entry in this one.
462 */
463
464 ep = XFS_BMBT_REC_ADDR(mp, block, 1);
465 if (i) {
466 ASSERT(xfs_bmbt_disk_get_startoff(&last) +
467 xfs_bmbt_disk_get_blockcount(&last) <=
468 xfs_bmbt_disk_get_startoff(ep));
469 }
470 for (j = 1; j < num_recs; j++) {
471 nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
472 ASSERT(xfs_bmbt_disk_get_startoff(ep) +
473 xfs_bmbt_disk_get_blockcount(ep) <=
474 xfs_bmbt_disk_get_startoff(nextp));
475 ep = nextp;
476 }
477
478 last = *ep;
479 i += num_recs;
480 if (bp_release) {
481 bp_release = 0;
482 xfs_trans_brelse(NULL, bp);
483 }
484 bno = nextbno;
485 /*
486 * If we've reached the end, stop.
487 */
488 if (bno == NULLFSBLOCK)
489 break;
490
491 bp_release = 0;
492 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
493 if (!bp) {
494 bp_release = 1;
495 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
496 XFS_BMAP_BTREE_REF,
497 &xfs_bmbt_buf_ops);
498 if (error)
499 goto error_norelse;
500 }
501 block = XFS_BUF_TO_BLOCK(bp);
502 }
503 if (bp_release) {
504 bp_release = 0;
505 xfs_trans_brelse(NULL, bp);
506 }
507 return;
508
509error0:
510 xfs_warn(mp, "%s: at error0", __func__);
511 if (bp_release)
512 xfs_trans_brelse(NULL, bp);
513error_norelse:
514 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
515 __func__, i);
516 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
517 return;
518}
519
520/*
521 * Add bmap trace insert entries for all the contents of the extent records.
522 */
523void
524xfs_bmap_trace_exlist(
525 xfs_inode_t *ip, /* incore inode pointer */
526 xfs_extnum_t cnt, /* count of entries in the list */
527 int whichfork, /* data or attr fork */
528 unsigned long caller_ip)
529{
530 xfs_extnum_t idx; /* extent record index */
531 xfs_ifork_t *ifp; /* inode fork pointer */
532 int state = 0;
533
534 if (whichfork == XFS_ATTR_FORK)
535 state |= BMAP_ATTRFORK;
536
537 ifp = XFS_IFORK_PTR(ip, whichfork);
538 ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
539 for (idx = 0; idx < cnt; idx++)
540 trace_xfs_extlist(ip, idx, whichfork, caller_ip);
541}
542
543/*
544 * Validate that the bmbt_irecs being returned from bmapi are valid
545 * given the caller's original parameters. Specifically check the
546 * ranges of the returned irecs to ensure that they only extend beyond
547 * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
548 */
549STATIC void
550xfs_bmap_validate_ret(
551 xfs_fileoff_t bno,
552 xfs_filblks_t len,
553 int flags,
554 xfs_bmbt_irec_t *mval,
555 int nmap,
556 int ret_nmap)
557{
558 int i; /* index to map values */
559
560 ASSERT(ret_nmap <= nmap);
561
562 for (i = 0; i < ret_nmap; i++) {
563 ASSERT(mval[i].br_blockcount > 0);
564 if (!(flags & XFS_BMAPI_ENTIRE)) {
565 ASSERT(mval[i].br_startoff >= bno);
566 ASSERT(mval[i].br_blockcount <= len);
567 ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
568 bno + len);
569 } else {
570 ASSERT(mval[i].br_startoff < bno + len);
571 ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
572 bno);
573 }
574 ASSERT(i == 0 ||
575 mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
576 mval[i].br_startoff);
577 ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
578 mval[i].br_startblock != HOLESTARTBLOCK);
579 ASSERT(mval[i].br_state == XFS_EXT_NORM ||
580 mval[i].br_state == XFS_EXT_UNWRITTEN);
581 }
582}
583
584#else
585#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
586#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
587#endif /* DEBUG */
588
589/*
590 * bmap free list manipulation functions
591 */
592
593/*
594 * Add the extent to the list of extents to be free at transaction end.
595 * The list is maintained sorted (by block number).
596 */
597void
598xfs_bmap_add_free(
599 xfs_fsblock_t bno, /* fs block number of extent */
600 xfs_filblks_t len, /* length of extent */
601 xfs_bmap_free_t *flist, /* list of extents */
602 xfs_mount_t *mp) /* mount point structure */
603{
604 xfs_bmap_free_item_t *cur; /* current (next) element */
605 xfs_bmap_free_item_t *new; /* new element */
606 xfs_bmap_free_item_t *prev; /* previous element */
607#ifdef DEBUG
608 xfs_agnumber_t agno;
609 xfs_agblock_t agbno;
610
611 ASSERT(bno != NULLFSBLOCK);
612 ASSERT(len > 0);
613 ASSERT(len <= MAXEXTLEN);
614 ASSERT(!isnullstartblock(bno));
615 agno = XFS_FSB_TO_AGNO(mp, bno);
616 agbno = XFS_FSB_TO_AGBNO(mp, bno);
617 ASSERT(agno < mp->m_sb.sb_agcount);
618 ASSERT(agbno < mp->m_sb.sb_agblocks);
619 ASSERT(len < mp->m_sb.sb_agblocks);
620 ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
621#endif
622 ASSERT(xfs_bmap_free_item_zone != NULL);
623 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
624 new->xbfi_startblock = bno;
625 new->xbfi_blockcount = (xfs_extlen_t)len;
626 for (prev = NULL, cur = flist->xbf_first;
627 cur != NULL;
628 prev = cur, cur = cur->xbfi_next) {
629 if (cur->xbfi_startblock >= bno)
630 break;
631 }
632 if (prev)
633 prev->xbfi_next = new;
634 else
635 flist->xbf_first = new;
636 new->xbfi_next = cur;
637 flist->xbf_count++;
638}
639
640/*
641 * Remove the entry "free" from the free item list. Prev points to the
642 * previous entry, unless "free" is the head of the list.
643 */
644void
645xfs_bmap_del_free(
646 xfs_bmap_free_t *flist, /* free item list header */
647 xfs_bmap_free_item_t *prev, /* previous item on list, if any */
648 xfs_bmap_free_item_t *free) /* list item to be freed */
649{
650 if (prev)
651 prev->xbfi_next = free->xbfi_next;
652 else
653 flist->xbf_first = free->xbfi_next;
654 flist->xbf_count--;
655 kmem_zone_free(xfs_bmap_free_item_zone, free);
656}
657
658/*
659 * Free up any items left in the list.
660 */
661void
662xfs_bmap_cancel(
663 xfs_bmap_free_t *flist) /* list of bmap_free_items */
664{
665 xfs_bmap_free_item_t *free; /* free list item */
666 xfs_bmap_free_item_t *next;
667
668 if (flist->xbf_count == 0)
669 return;
670 ASSERT(flist->xbf_first != NULL);
671 for (free = flist->xbf_first; free; free = next) {
672 next = free->xbfi_next;
673 xfs_bmap_del_free(flist, NULL, free);
674 }
675 ASSERT(flist->xbf_count == 0);
676}
677
678/*
679 * Inode fork format manipulation functions
680 */
681
682/*
683 * Transform a btree format file with only one leaf node, where the
684 * extents list will fit in the inode, into an extents format file.
685 * Since the file extents are already in-core, all we have to do is
686 * give up the space for the btree root and pitch the leaf block.
687 */
688STATIC int /* error */
689xfs_bmap_btree_to_extents(
690 xfs_trans_t *tp, /* transaction pointer */
691 xfs_inode_t *ip, /* incore inode pointer */
692 xfs_btree_cur_t *cur, /* btree cursor */
693 int *logflagsp, /* inode logging flags */
694 int whichfork) /* data or attr fork */
695{
696 /* REFERENCED */
697 struct xfs_btree_block *cblock;/* child btree block */
698 xfs_fsblock_t cbno; /* child block number */
699 xfs_buf_t *cbp; /* child block's buffer */
700 int error; /* error return value */
701 xfs_ifork_t *ifp; /* inode fork data */
702 xfs_mount_t *mp; /* mount point structure */
703 __be64 *pp; /* ptr to block address */
704 struct xfs_btree_block *rblock;/* root btree block */
705
706 mp = ip->i_mount;
707 ifp = XFS_IFORK_PTR(ip, whichfork);
708 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
709 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
710 rblock = ifp->if_broot;
711 ASSERT(be16_to_cpu(rblock->bb_level) == 1);
712 ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
713 ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
714 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
715 cbno = be64_to_cpu(*pp);
716 *logflagsp = 0;
717#ifdef DEBUG
718 if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
719 return error;
720#endif
721 error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
722 &xfs_bmbt_buf_ops);
723 if (error)
724 return error;
725 cblock = XFS_BUF_TO_BLOCK(cbp);
726 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
727 return error;
728 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
729 ip->i_d.di_nblocks--;
730 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
731 xfs_trans_binval(tp, cbp);
732 if (cur->bc_bufs[0] == cbp)
733 cur->bc_bufs[0] = NULL;
734 xfs_iroot_realloc(ip, -1, whichfork);
735 ASSERT(ifp->if_broot == NULL);
736 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
737 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
738 *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
739 return 0;
740}
741
742/*
743 * Convert an extents-format file into a btree-format file.
744 * The new file will have a root block (in the inode) and a single child block.
745 */
746STATIC int /* error */
747xfs_bmap_extents_to_btree(
748 xfs_trans_t *tp, /* transaction pointer */
749 xfs_inode_t *ip, /* incore inode pointer */
750 xfs_fsblock_t *firstblock, /* first-block-allocated */
751 xfs_bmap_free_t *flist, /* blocks freed in xaction */
752 xfs_btree_cur_t **curp, /* cursor returned to caller */
753 int wasdel, /* converting a delayed alloc */
754 int *logflagsp, /* inode logging flags */
755 int whichfork) /* data or attr fork */
756{
757 struct xfs_btree_block *ablock; /* allocated (child) bt block */
758 xfs_buf_t *abp; /* buffer for ablock */
759 xfs_alloc_arg_t args; /* allocation arguments */
760 xfs_bmbt_rec_t *arp; /* child record pointer */
761 struct xfs_btree_block *block; /* btree root block */
762 xfs_btree_cur_t *cur; /* bmap btree cursor */
763 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
764 int error; /* error return value */
765 xfs_extnum_t i, cnt; /* extent record index */
766 xfs_ifork_t *ifp; /* inode fork pointer */
767 xfs_bmbt_key_t *kp; /* root block key pointer */
768 xfs_mount_t *mp; /* mount structure */
769 xfs_extnum_t nextents; /* number of file extents */
770 xfs_bmbt_ptr_t *pp; /* root block address pointer */
771
772 mp = ip->i_mount;
773 ifp = XFS_IFORK_PTR(ip, whichfork);
774 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
775
776 /*
777 * Make space in the inode incore.
778 */
779 xfs_iroot_realloc(ip, 1, whichfork);
780 ifp->if_flags |= XFS_IFBROOT;
781
782 /*
783 * Fill in the root.
784 */
785 block = ifp->if_broot;
786 if (xfs_sb_version_hascrc(&mp->m_sb))
787 xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
788 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
789 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
790 else
791 xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
792 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
793 XFS_BTREE_LONG_PTRS);
794
795 /*
796 * Need a cursor. Can't allocate until bb_level is filled in.
797 */
798 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
799 cur->bc_private.b.firstblock = *firstblock;
800 cur->bc_private.b.flist = flist;
801 cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
802 /*
803 * Convert to a btree with two levels, one record in root.
804 */
805 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
806 memset(&args, 0, sizeof(args));
807 args.tp = tp;
808 args.mp = mp;
809 args.firstblock = *firstblock;
810 if (*firstblock == NULLFSBLOCK) {
811 args.type = XFS_ALLOCTYPE_START_BNO;
812 args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
813 } else if (flist->xbf_low) {
814 args.type = XFS_ALLOCTYPE_START_BNO;
815 args.fsbno = *firstblock;
816 } else {
817 args.type = XFS_ALLOCTYPE_NEAR_BNO;
818 args.fsbno = *firstblock;
819 }
820 args.minlen = args.maxlen = args.prod = 1;
821 args.wasdel = wasdel;
822 *logflagsp = 0;
823 if ((error = xfs_alloc_vextent(&args))) {
824 xfs_iroot_realloc(ip, -1, whichfork);
825 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
826 return error;
827 }
828 /*
829 * Allocation can't fail, the space was reserved.
830 */
831 ASSERT(args.fsbno != NULLFSBLOCK);
832 ASSERT(*firstblock == NULLFSBLOCK ||
833 args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
834 (flist->xbf_low &&
835 args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
836 *firstblock = cur->bc_private.b.firstblock = args.fsbno;
837 cur->bc_private.b.allocated++;
838 ip->i_d.di_nblocks++;
839 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
840 abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
841 /*
842 * Fill in the child block.
843 */
844 abp->b_ops = &xfs_bmbt_buf_ops;
845 ablock = XFS_BUF_TO_BLOCK(abp);
846 if (xfs_sb_version_hascrc(&mp->m_sb))
847 xfs_btree_init_block_int(mp, ablock, abp->b_bn,
848 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
849 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
850 else
851 xfs_btree_init_block_int(mp, ablock, abp->b_bn,
852 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
853 XFS_BTREE_LONG_PTRS);
854
855 arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
856 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
857 for (cnt = i = 0; i < nextents; i++) {
858 ep = xfs_iext_get_ext(ifp, i);
859 if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
860 arp->l0 = cpu_to_be64(ep->l0);
861 arp->l1 = cpu_to_be64(ep->l1);
862 arp++; cnt++;
863 }
864 }
865 ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
866 xfs_btree_set_numrecs(ablock, cnt);
867
868 /*
869 * Fill in the root key and pointer.
870 */
871 kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
872 arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
873 kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
874 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
875 be16_to_cpu(block->bb_level)));
876 *pp = cpu_to_be64(args.fsbno);
877
878 /*
879 * Do all this logging at the end so that
880 * the root is at the right level.
881 */
882 xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
883 xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
884 ASSERT(*curp == NULL);
885 *curp = cur;
886 *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
887 return 0;
888}
889
890/*
891 * Convert a local file to an extents file.
892 * This code is out of bounds for data forks of regular files,
893 * since the file data needs to get logged so things will stay consistent.
894 * (The bmap-level manipulations are ok, though).
895 */
896void
897xfs_bmap_local_to_extents_empty(
898 struct xfs_inode *ip,
899 int whichfork)
900{
901 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
902
903 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
904 ASSERT(ifp->if_bytes == 0);
905 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
906
907 xfs_bmap_forkoff_reset(ip, whichfork);
908 ifp->if_flags &= ~XFS_IFINLINE;
909 ifp->if_flags |= XFS_IFEXTENTS;
910 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
911}
912
913
914STATIC int /* error */
915xfs_bmap_local_to_extents(
916 xfs_trans_t *tp, /* transaction pointer */
917 xfs_inode_t *ip, /* incore inode pointer */
918 xfs_fsblock_t *firstblock, /* first block allocated in xaction */
919 xfs_extlen_t total, /* total blocks needed by transaction */
920 int *logflagsp, /* inode logging flags */
921 int whichfork,
922 void (*init_fn)(struct xfs_trans *tp,
923 struct xfs_buf *bp,
924 struct xfs_inode *ip,
925 struct xfs_ifork *ifp))
926{
927 int error = 0;
928 int flags; /* logging flags returned */
929 xfs_ifork_t *ifp; /* inode fork pointer */
930 xfs_alloc_arg_t args; /* allocation arguments */
931 xfs_buf_t *bp; /* buffer for extent block */
932 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
933
934 /*
935 * We don't want to deal with the case of keeping inode data inline yet.
936 * So sending the data fork of a regular inode is invalid.
937 */
938 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
939 ifp = XFS_IFORK_PTR(ip, whichfork);
940 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
941
942 if (!ifp->if_bytes) {
943 xfs_bmap_local_to_extents_empty(ip, whichfork);
944 flags = XFS_ILOG_CORE;
945 goto done;
946 }
947
948 flags = 0;
949 error = 0;
950 ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
951 XFS_IFINLINE);
952 memset(&args, 0, sizeof(args));
953 args.tp = tp;
954 args.mp = ip->i_mount;
955 args.firstblock = *firstblock;
956 /*
957 * Allocate a block. We know we need only one, since the
958 * file currently fits in an inode.
959 */
960 if (*firstblock == NULLFSBLOCK) {
961 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
962 args.type = XFS_ALLOCTYPE_START_BNO;
963 } else {
964 args.fsbno = *firstblock;
965 args.type = XFS_ALLOCTYPE_NEAR_BNO;
966 }
967 args.total = total;
968 args.minlen = args.maxlen = args.prod = 1;
969 error = xfs_alloc_vextent(&args);
970 if (error)
971 goto done;
972
973 /* Can't fail, the space was reserved. */
974 ASSERT(args.fsbno != NULLFSBLOCK);
975 ASSERT(args.len == 1);
976 *firstblock = args.fsbno;
977 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
978
979 /* initialise the block and copy the data */
980 init_fn(tp, bp, ip, ifp);
981
982 /* account for the change in fork size and log everything */
983 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
984 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
985 xfs_bmap_local_to_extents_empty(ip, whichfork);
986 flags |= XFS_ILOG_CORE;
987
988 xfs_iext_add(ifp, 0, 1);
989 ep = xfs_iext_get_ext(ifp, 0);
990 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
991 trace_xfs_bmap_post_update(ip, 0,
992 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
993 _THIS_IP_);
994 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
995 ip->i_d.di_nblocks = 1;
996 xfs_trans_mod_dquot_byino(tp, ip,
997 XFS_TRANS_DQ_BCOUNT, 1L);
998 flags |= xfs_ilog_fext(whichfork);
999
1000done:
1001 *logflagsp = flags;
1002 return error;
1003}
1004
1005/*
1006 * Called from xfs_bmap_add_attrfork to handle btree format files.
1007 */
1008STATIC int /* error */
1009xfs_bmap_add_attrfork_btree(
1010 xfs_trans_t *tp, /* transaction pointer */
1011 xfs_inode_t *ip, /* incore inode pointer */
1012 xfs_fsblock_t *firstblock, /* first block allocated */
1013 xfs_bmap_free_t *flist, /* blocks to free at commit */
1014 int *flags) /* inode logging flags */
1015{
1016 xfs_btree_cur_t *cur; /* btree cursor */
1017 int error; /* error return value */
1018 xfs_mount_t *mp; /* file system mount struct */
1019 int stat; /* newroot status */
1020
1021 mp = ip->i_mount;
1022 if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
1023 *flags |= XFS_ILOG_DBROOT;
1024 else {
1025 cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
1026 cur->bc_private.b.flist = flist;
1027 cur->bc_private.b.firstblock = *firstblock;
1028 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
1029 goto error0;
1030 /* must be at least one entry */
1031 XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
1032 if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
1033 goto error0;
1034 if (stat == 0) {
1035 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1036 return -ENOSPC;
1037 }
1038 *firstblock = cur->bc_private.b.firstblock;
1039 cur->bc_private.b.allocated = 0;
1040 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1041 }
1042 return 0;
1043error0:
1044 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1045 return error;
1046}
1047
1048/*
1049 * Called from xfs_bmap_add_attrfork to handle extents format files.
1050 */
1051STATIC int /* error */
1052xfs_bmap_add_attrfork_extents(
1053 xfs_trans_t *tp, /* transaction pointer */
1054 xfs_inode_t *ip, /* incore inode pointer */
1055 xfs_fsblock_t *firstblock, /* first block allocated */
1056 xfs_bmap_free_t *flist, /* blocks to free at commit */
1057 int *flags) /* inode logging flags */
1058{
1059 xfs_btree_cur_t *cur; /* bmap btree cursor */
1060 int error; /* error return value */
1061
1062 if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
1063 return 0;
1064 cur = NULL;
1065 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
1066 flags, XFS_DATA_FORK);
1067 if (cur) {
1068 cur->bc_private.b.allocated = 0;
1069 xfs_btree_del_cursor(cur,
1070 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
1071 }
1072 return error;
1073}
1074
1075/*
1076 * Called from xfs_bmap_add_attrfork to handle local format files. Each
1077 * different data fork content type needs a different callout to do the
1078 * conversion. Some are basic and only require special block initialisation
1079 * callouts for the data formating, others (directories) are so specialised they
1080 * handle everything themselves.
1081 *
1082 * XXX (dgc): investigate whether directory conversion can use the generic
1083 * formatting callout. It should be possible - it's just a very complex
1084 * formatter.
1085 */
1086STATIC int /* error */
1087xfs_bmap_add_attrfork_local(
1088 xfs_trans_t *tp, /* transaction pointer */
1089 xfs_inode_t *ip, /* incore inode pointer */
1090 xfs_fsblock_t *firstblock, /* first block allocated */
1091 xfs_bmap_free_t *flist, /* blocks to free at commit */
1092 int *flags) /* inode logging flags */
1093{
1094 xfs_da_args_t dargs; /* args for dir/attr code */
1095
1096 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
1097 return 0;
1098
1099 if (S_ISDIR(ip->i_d.di_mode)) {
1100 memset(&dargs, 0, sizeof(dargs));
1101 dargs.geo = ip->i_mount->m_dir_geo;
1102 dargs.dp = ip;
1103 dargs.firstblock = firstblock;
1104 dargs.flist = flist;
1105 dargs.total = dargs.geo->fsbcount;
1106 dargs.whichfork = XFS_DATA_FORK;
1107 dargs.trans = tp;
1108 return xfs_dir2_sf_to_block(&dargs);
1109 }
1110
1111 if (S_ISLNK(ip->i_d.di_mode))
1112 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
1113 flags, XFS_DATA_FORK,
1114 xfs_symlink_local_to_remote);
1115
1116 /* should only be called for types that support local format data */
1117 ASSERT(0);
1118 return -EFSCORRUPTED;
1119}
1120
1121/*
1122 * Convert inode from non-attributed to attributed.
1123 * Must not be in a transaction, ip must not be locked.
1124 */
1125int /* error code */
1126xfs_bmap_add_attrfork(
1127 xfs_inode_t *ip, /* incore inode pointer */
1128 int size, /* space new attribute needs */
1129 int rsvd) /* xact may use reserved blks */
1130{
1131 xfs_fsblock_t firstblock; /* 1st block/ag allocated */
1132 xfs_bmap_free_t flist; /* freed extent records */
1133 xfs_mount_t *mp; /* mount structure */
1134 xfs_trans_t *tp; /* transaction pointer */
1135 int blks; /* space reservation */
1136 int version = 1; /* superblock attr version */
1137 int committed; /* xaction was committed */
1138 int logflags; /* logging flags */
1139 int error; /* error return value */
1140 int cancel_flags = 0;
1141
1142 ASSERT(XFS_IFORK_Q(ip) == 0);
1143
1144 mp = ip->i_mount;
1145 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1146 tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
1147 blks = XFS_ADDAFORK_SPACE_RES(mp);
1148 if (rsvd)
1149 tp->t_flags |= XFS_TRANS_RESERVE;
1150 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
1151 if (error) {
1152 xfs_trans_cancel(tp, 0);
1153 return error;
1154 }
1155 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1156 xfs_ilock(ip, XFS_ILOCK_EXCL);
1157 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
1158 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
1159 XFS_QMOPT_RES_REGBLKS);
1160 if (error)
1161 goto trans_cancel;
1162 cancel_flags |= XFS_TRANS_ABORT;
1163 if (XFS_IFORK_Q(ip))
1164 goto trans_cancel;
1165 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
1166 /*
1167 * For inodes coming from pre-6.2 filesystems.
1168 */
1169 ASSERT(ip->i_d.di_aformat == 0);
1170 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1171 }
1172 ASSERT(ip->i_d.di_anextents == 0);
1173
1174 xfs_trans_ijoin(tp, ip, 0);
1175 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1176
1177 switch (ip->i_d.di_format) {
1178 case XFS_DINODE_FMT_DEV:
1179 ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
1180 break;
1181 case XFS_DINODE_FMT_UUID:
1182 ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
1183 break;
1184 case XFS_DINODE_FMT_LOCAL:
1185 case XFS_DINODE_FMT_EXTENTS:
1186 case XFS_DINODE_FMT_BTREE:
1187 ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
1188 if (!ip->i_d.di_forkoff)
1189 ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
1190 else if (mp->m_flags & XFS_MOUNT_ATTR2)
1191 version = 2;
1192 break;
1193 default:
1194 ASSERT(0);
1195 error = -EINVAL;
1196 goto trans_cancel;
1197 }
1198
1199 ASSERT(ip->i_afp == NULL);
1200 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
1201 ip->i_afp->if_flags = XFS_IFEXTENTS;
1202 logflags = 0;
1203 xfs_bmap_init(&flist, &firstblock);
1204 switch (ip->i_d.di_format) {
1205 case XFS_DINODE_FMT_LOCAL:
1206 error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
1207 &logflags);
1208 break;
1209 case XFS_DINODE_FMT_EXTENTS:
1210 error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
1211 &flist, &logflags);
1212 break;
1213 case XFS_DINODE_FMT_BTREE:
1214 error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
1215 &logflags);
1216 break;
1217 default:
1218 error = 0;
1219 break;
1220 }
1221 if (logflags)
1222 xfs_trans_log_inode(tp, ip, logflags);
1223 if (error)
1224 goto bmap_cancel;
1225 if (!xfs_sb_version_hasattr(&mp->m_sb) ||
1226 (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
1227 __int64_t sbfields = 0;
1228
1229 spin_lock(&mp->m_sb_lock);
1230 if (!xfs_sb_version_hasattr(&mp->m_sb)) {
1231 xfs_sb_version_addattr(&mp->m_sb);
1232 sbfields |= XFS_SB_VERSIONNUM;
1233 }
1234 if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
1235 xfs_sb_version_addattr2(&mp->m_sb);
1236 sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
1237 }
1238 if (sbfields) {
1239 spin_unlock(&mp->m_sb_lock);
1240 xfs_mod_sb(tp, sbfields);
1241 } else
1242 spin_unlock(&mp->m_sb_lock);
1243 }
1244
1245 error = xfs_bmap_finish(&tp, &flist, &committed);
1246 if (error)
1247 goto bmap_cancel;
1248 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1249 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1250 return error;
1251
1252bmap_cancel:
1253 xfs_bmap_cancel(&flist);
1254trans_cancel:
1255 xfs_trans_cancel(tp, cancel_flags);
1256 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1257 return error;
1258}
1259
1260/*
1261 * Internal and external extent tree search functions.
1262 */
1263
1264/*
1265 * Read in the extents to if_extents.
1266 * All inode fields are set up by caller, we just traverse the btree
1267 * and copy the records in. If the file system cannot contain unwritten
1268 * extents, the records are checked for no "state" flags.
1269 */
1270int /* error */
1271xfs_bmap_read_extents(
1272 xfs_trans_t *tp, /* transaction pointer */
1273 xfs_inode_t *ip, /* incore inode */
1274 int whichfork) /* data or attr fork */
1275{
1276 struct xfs_btree_block *block; /* current btree block */
1277 xfs_fsblock_t bno; /* block # of "block" */
1278 xfs_buf_t *bp; /* buffer for "block" */
1279 int error; /* error return value */
1280 xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
1281 xfs_extnum_t i, j; /* index into the extents list */
1282 xfs_ifork_t *ifp; /* fork structure */
1283 int level; /* btree level, for checking */
1284 xfs_mount_t *mp; /* file system mount structure */
1285 __be64 *pp; /* pointer to block address */
1286 /* REFERENCED */
1287 xfs_extnum_t room; /* number of entries there's room for */
1288
1289 bno = NULLFSBLOCK;
1290 mp = ip->i_mount;
1291 ifp = XFS_IFORK_PTR(ip, whichfork);
1292 exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
1293 XFS_EXTFMT_INODE(ip);
1294 block = ifp->if_broot;
1295 /*
1296 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
1297 */
1298 level = be16_to_cpu(block->bb_level);
1299 ASSERT(level > 0);
1300 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
1301 bno = be64_to_cpu(*pp);
1302 ASSERT(bno != NULLDFSBNO);
1303 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
1304 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
1305 /*
1306 * Go down the tree until leaf level is reached, following the first
1307 * pointer (leftmost) at each level.
1308 */
1309 while (level-- > 0) {
1310 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
1311 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
1312 if (error)
1313 return error;
1314 block = XFS_BUF_TO_BLOCK(bp);
1315 XFS_WANT_CORRUPTED_GOTO(
1316 xfs_bmap_sanity_check(mp, bp, level),
1317 error0);
1318 if (level == 0)
1319 break;
1320 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
1321 bno = be64_to_cpu(*pp);
1322 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
1323 xfs_trans_brelse(tp, bp);
1324 }
1325 /*
1326 * Here with bp and block set to the leftmost leaf node in the tree.
1327 */
1328 room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1329 i = 0;
1330 /*
1331 * Loop over all leaf nodes. Copy information to the extent records.
1332 */
1333 for (;;) {
1334 xfs_bmbt_rec_t *frp;
1335 xfs_fsblock_t nextbno;
1336 xfs_extnum_t num_recs;
1337 xfs_extnum_t start;
1338
1339 num_recs = xfs_btree_get_numrecs(block);
1340 if (unlikely(i + num_recs > room)) {
1341 ASSERT(i + num_recs <= room);
1342 xfs_warn(ip->i_mount,
1343 "corrupt dinode %Lu, (btree extents).",
1344 (unsigned long long) ip->i_ino);
1345 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
1346 XFS_ERRLEVEL_LOW, ip->i_mount, block);
1347 goto error0;
1348 }
1349 XFS_WANT_CORRUPTED_GOTO(
1350 xfs_bmap_sanity_check(mp, bp, 0),
1351 error0);
1352 /*
1353 * Read-ahead the next leaf block, if any.
1354 */
1355 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
1356 if (nextbno != NULLFSBLOCK)
1357 xfs_btree_reada_bufl(mp, nextbno, 1,
1358 &xfs_bmbt_buf_ops);
1359 /*
1360 * Copy records into the extent records.
1361 */
1362 frp = XFS_BMBT_REC_ADDR(mp, block, 1);
1363 start = i;
1364 for (j = 0; j < num_recs; j++, i++, frp++) {
1365 xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
1366 trp->l0 = be64_to_cpu(frp->l0);
1367 trp->l1 = be64_to_cpu(frp->l1);
1368 }
1369 if (exntf == XFS_EXTFMT_NOSTATE) {
1370 /*
1371 * Check all attribute bmap btree records and
1372 * any "older" data bmap btree records for a
1373 * set bit in the "extent flag" position.
1374 */
1375 if (unlikely(xfs_check_nostate_extents(ifp,
1376 start, num_recs))) {
1377 XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
1378 XFS_ERRLEVEL_LOW,
1379 ip->i_mount);
1380 goto error0;
1381 }
1382 }
1383 xfs_trans_brelse(tp, bp);
1384 bno = nextbno;
1385 /*
1386 * If we've reached the end, stop.
1387 */
1388 if (bno == NULLFSBLOCK)
1389 break;
1390 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
1391 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
1392 if (error)
1393 return error;
1394 block = XFS_BUF_TO_BLOCK(bp);
1395 }
1396 ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
1397 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
1398 XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
1399 return 0;
1400error0:
1401 xfs_trans_brelse(tp, bp);
1402 return -EFSCORRUPTED;
1403}
1404
1405
1406/*
1407 * Search the extent records for the entry containing block bno.
1408 * If bno lies in a hole, point to the next entry. If bno lies
1409 * past eof, *eofp will be set, and *prevp will contain the last
1410 * entry (null if none). Else, *lastxp will be set to the index
1411 * of the found entry; *gotp will contain the entry.
1412 */
1413STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
1414xfs_bmap_search_multi_extents(
1415 xfs_ifork_t *ifp, /* inode fork pointer */
1416 xfs_fileoff_t bno, /* block number searched for */
1417 int *eofp, /* out: end of file found */
1418 xfs_extnum_t *lastxp, /* out: last extent index */
1419 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
1420 xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
1421{
1422 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
1423 xfs_extnum_t lastx; /* last extent index */
1424
1425 /*
1426 * Initialize the extent entry structure to catch access to
1427 * uninitialized br_startblock field.
1428 */
1429 gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
1430 gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
1431 gotp->br_state = XFS_EXT_INVALID;
1432#if XFS_BIG_BLKNOS
1433 gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
1434#else
1435 gotp->br_startblock = 0xffffa5a5;
1436#endif
1437 prevp->br_startoff = NULLFILEOFF;
1438
1439 ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
1440 if (lastx > 0) {
1441 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
1442 }
1443 if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
1444 xfs_bmbt_get_all(ep, gotp);
1445 *eofp = 0;
1446 } else {
1447 if (lastx > 0) {
1448 *gotp = *prevp;
1449 }
1450 *eofp = 1;
1451 ep = NULL;
1452 }
1453 *lastxp = lastx;
1454 return ep;
1455}
1456
1457/*
1458 * Search the extents list for the inode, for the extent containing bno.
1459 * If bno lies in a hole, point to the next entry. If bno lies past eof,
1460 * *eofp will be set, and *prevp will contain the last entry (null if none).
1461 * Else, *lastxp will be set to the index of the found
1462 * entry; *gotp will contain the entry.
1463 */
1464STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
1465xfs_bmap_search_extents(
1466 xfs_inode_t *ip, /* incore inode pointer */
1467 xfs_fileoff_t bno, /* block number searched for */
1468 int fork, /* data or attr fork */
1469 int *eofp, /* out: end of file found */
1470 xfs_extnum_t *lastxp, /* out: last extent index */
1471 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
1472 xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
1473{
1474 xfs_ifork_t *ifp; /* inode fork pointer */
1475 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
1476
1477 XFS_STATS_INC(xs_look_exlist);
1478 ifp = XFS_IFORK_PTR(ip, fork);
1479
1480 ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
1481
1482 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
1483 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
1484 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
1485 "Access to block zero in inode %llu "
1486 "start_block: %llx start_off: %llx "
1487 "blkcnt: %llx extent-state: %x lastx: %x",
1488 (unsigned long long)ip->i_ino,
1489 (unsigned long long)gotp->br_startblock,
1490 (unsigned long long)gotp->br_startoff,
1491 (unsigned long long)gotp->br_blockcount,
1492 gotp->br_state, *lastxp);
1493 *lastxp = NULLEXTNUM;
1494 *eofp = 1;
1495 return NULL;
1496 }
1497 return ep;
1498}
1499
1500/*
1501 * Returns the file-relative block number of the first unused block(s)
1502 * in the file with at least "len" logically contiguous blocks free.
1503 * This is the lowest-address hole if the file has holes, else the first block
1504 * past the end of file.
1505 * Return 0 if the file is currently local (in-inode).
1506 */
1507int /* error */
1508xfs_bmap_first_unused(
1509 xfs_trans_t *tp, /* transaction pointer */
1510 xfs_inode_t *ip, /* incore inode */
1511 xfs_extlen_t len, /* size of hole to find */
1512 xfs_fileoff_t *first_unused, /* unused block */
1513 int whichfork) /* data or attr fork */
1514{
1515 int error; /* error return value */
1516 int idx; /* extent record index */
1517 xfs_ifork_t *ifp; /* inode fork pointer */
1518 xfs_fileoff_t lastaddr; /* last block number seen */
1519 xfs_fileoff_t lowest; /* lowest useful block */
1520 xfs_fileoff_t max; /* starting useful block */
1521 xfs_fileoff_t off; /* offset for this block */
1522 xfs_extnum_t nextents; /* number of extent entries */
1523
1524 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
1525 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
1526 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
1527 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
1528 *first_unused = 0;
1529 return 0;
1530 }
1531 ifp = XFS_IFORK_PTR(ip, whichfork);
1532 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
1533 (error = xfs_iread_extents(tp, ip, whichfork)))
1534 return error;
1535 lowest = *first_unused;
1536 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1537 for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
1538 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
1539 off = xfs_bmbt_get_startoff(ep);
1540 /*
1541 * See if the hole before this extent will work.
1542 */
1543 if (off >= lowest + len && off - max >= len) {
1544 *first_unused = max;
1545 return 0;
1546 }
1547 lastaddr = off + xfs_bmbt_get_blockcount(ep);
1548 max = XFS_FILEOFF_MAX(lastaddr, lowest);
1549 }
1550 *first_unused = max;
1551 return 0;
1552}
1553
1554/*
1555 * Returns the file-relative block number of the last block - 1 before
1556 * last_block (input value) in the file.
1557 * This is not based on i_size, it is based on the extent records.
1558 * Returns 0 for local files, as they do not have extent records.
1559 */
1560int /* error */
1561xfs_bmap_last_before(
1562 xfs_trans_t *tp, /* transaction pointer */
1563 xfs_inode_t *ip, /* incore inode */
1564 xfs_fileoff_t *last_block, /* last block */
1565 int whichfork) /* data or attr fork */
1566{
1567 xfs_fileoff_t bno; /* input file offset */
1568 int eof; /* hit end of file */
1569 xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
1570 int error; /* error return value */
1571 xfs_bmbt_irec_t got; /* current extent value */
1572 xfs_ifork_t *ifp; /* inode fork pointer */
1573 xfs_extnum_t lastx; /* last extent used */
1574 xfs_bmbt_irec_t prev; /* previous extent value */
1575
1576 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
1577 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
1578 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
1579 return -EIO;
1580 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
1581 *last_block = 0;
1582 return 0;
1583 }
1584 ifp = XFS_IFORK_PTR(ip, whichfork);
1585 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
1586 (error = xfs_iread_extents(tp, ip, whichfork)))
1587 return error;
1588 bno = *last_block - 1;
1589 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
1590 &prev);
1591 if (eof || xfs_bmbt_get_startoff(ep) > bno) {
1592 if (prev.br_startoff == NULLFILEOFF)
1593 *last_block = 0;
1594 else
1595 *last_block = prev.br_startoff + prev.br_blockcount;
1596 }
1597 /*
1598 * Otherwise *last_block is already the right answer.
1599 */
1600 return 0;
1601}
1602
1603int
1604xfs_bmap_last_extent(
1605 struct xfs_trans *tp,
1606 struct xfs_inode *ip,
1607 int whichfork,
1608 struct xfs_bmbt_irec *rec,
1609 int *is_empty)
1610{
1611 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
1612 int error;
1613 int nextents;
1614
1615 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
1616 error = xfs_iread_extents(tp, ip, whichfork);
1617 if (error)
1618 return error;
1619 }
1620
1621 nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
1622 if (nextents == 0) {
1623 *is_empty = 1;
1624 return 0;
1625 }
1626
1627 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
1628 *is_empty = 0;
1629 return 0;
1630}
1631
1632/*
1633 * Check the last inode extent to determine whether this allocation will result
1634 * in blocks being allocated at the end of the file. When we allocate new data
1635 * blocks at the end of the file which do not start at the previous data block,
1636 * we will try to align the new blocks at stripe unit boundaries.
1637 *
1638 * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
1639 * at, or past the EOF.
1640 */
1641STATIC int
1642xfs_bmap_isaeof(
1643 struct xfs_bmalloca *bma,
1644 int whichfork)
1645{
1646 struct xfs_bmbt_irec rec;
1647 int is_empty;
1648 int error;
1649
1650 bma->aeof = 0;
1651 error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
1652 &is_empty);
1653 if (error)
1654 return error;
1655
1656 if (is_empty) {
1657 bma->aeof = 1;
1658 return 0;
1659 }
1660
1661 /*
1662 * Check if we are allocation or past the last extent, or at least into
1663 * the last delayed allocated extent.
1664 */
1665 bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
1666 (bma->offset >= rec.br_startoff &&
1667 isnullstartblock(rec.br_startblock));
1668 return 0;
1669}
1670
1671/*
1672 * Returns the file-relative block number of the first block past eof in
1673 * the file. This is not based on i_size, it is based on the extent records.
1674 * Returns 0 for local files, as they do not have extent records.
1675 */
1676int
1677xfs_bmap_last_offset(
1678 struct xfs_inode *ip,
1679 xfs_fileoff_t *last_block,
1680 int whichfork)
1681{
1682 struct xfs_bmbt_irec rec;
1683 int is_empty;
1684 int error;
1685
1686 *last_block = 0;
1687
1688 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
1689 return 0;
1690
1691 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
1692 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
1693 return -EIO;
1694
1695 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
1696 if (error || is_empty)
1697 return error;
1698
1699 *last_block = rec.br_startoff + rec.br_blockcount;
1700 return 0;
1701}
1702
1703/*
1704 * Returns whether the selected fork of the inode has exactly one
1705 * block or not. For the data fork we check this matches di_size,
1706 * implying the file's range is 0..bsize-1.
1707 */
1708int /* 1=>1 block, 0=>otherwise */
1709xfs_bmap_one_block(
1710 xfs_inode_t *ip, /* incore inode */
1711 int whichfork) /* data or attr fork */
1712{
1713 xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
1714 xfs_ifork_t *ifp; /* inode fork pointer */
1715 int rval; /* return value */
1716 xfs_bmbt_irec_t s; /* internal version of extent */
1717
1718#ifndef DEBUG
1719 if (whichfork == XFS_DATA_FORK)
1720 return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
1721#endif /* !DEBUG */
1722 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
1723 return 0;
1724 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
1725 return 0;
1726 ifp = XFS_IFORK_PTR(ip, whichfork);
1727 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
1728 ep = xfs_iext_get_ext(ifp, 0);
1729 xfs_bmbt_get_all(ep, &s);
1730 rval = s.br_startoff == 0 && s.br_blockcount == 1;
1731 if (rval && whichfork == XFS_DATA_FORK)
1732 ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
1733 return rval;
1734}
1735
1736/*
1737 * Extent tree manipulation functions used during allocation.
1738 */
1739
1740/*
1741 * Convert a delayed allocation to a real allocation.
 *
 * Inputs, all taken from *bma:
 *   bma->got   - the new real extent ("new"); asserted not to be a
 *                delayed (null startblock) extent.
 *   bma->idx   - index in the data fork extent list of the existing delayed
 *                extent (PREV) that is asserted to fully contain "new".
 *   bma->cur   - bmap btree cursor, or NULL when the fork is not in btree
 *                format; when non-NULL the btree is updated in step with
 *                the in-core extent list.
 *
 * The work is driven by four state bits: whether "new" starts at PREV's
 * start (LEFT_FILLING), ends at PREV's end (RIGHT_FILLING), and whether it
 * can be merged with a real left or right neighbour (LEFT_CONTIG /
 * RIGHT_CONTIG, which also require the merged length to stay within
 * MAXEXTLEN).  Each reachable combination has its own case in the switch
 * below; in every case bma->idx is left pointing at the in-core record that
 * now holds "new".
 *
 * Afterwards the fork is converted to btree format if
 * xfs_bmap_needs_btree() says so, and the difference between the old and
 * new indirect-block reservations is applied to the XFS_SBS_FDBLOCKS
 * counter.
 *
 * Inode logging flags are accumulated in rval and OR-ed into bma->logflags
 * on every exit path.  Returns 0 or the first error encountered.
1742 */
1743STATIC int /* error */
1744xfs_bmap_add_extent_delay_real(
1745 struct xfs_bmalloca *bma)
1746{
1747 struct xfs_bmbt_irec *new = &bma->got;
1748 int diff; /* temp value */
1749 xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
1750 int error; /* error return value */
1751 int i; /* temp state */
1752 xfs_ifork_t *ifp; /* inode fork pointer */
1753 xfs_fileoff_t new_endoff; /* end offset of new entry */
1754 xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
1755 /* left is 0, right is 1, prev is 2 */
1756 int rval=0; /* return value (logging flags) */
1757 int state = 0;/* state bits, accessed thru macros */
1758 xfs_filblks_t da_new; /* new count del alloc blocks used */
1759 xfs_filblks_t da_old; /* old count del alloc blocks used */
1760 xfs_filblks_t temp=0; /* value for da_new calculations */
1761 xfs_filblks_t temp2=0;/* value for da_new calculations */
1762 int tmp_rval; /* partial logging flags */
1763
1764 ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
1765
1766 ASSERT(bma->idx >= 0);
1767 ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
1768 ASSERT(!isnullstartblock(new->br_startblock));
1769 ASSERT(!bma->cur ||
1770 (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
1771
1772 XFS_STATS_INC(xs_add_exlist);
1773
 /* Shorthand for the three slots of r[]; undefined again at function end. */
1774#define LEFT r[0]
1775#define RIGHT r[1]
1776#define PREV r[2]
1777
1778 /*
1779 * Set up a bunch of variables to make the tests simpler.
1780 */
1781 ep = xfs_iext_get_ext(ifp, bma->idx);
1782 xfs_bmbt_get_all(ep, &PREV);
1783 new_endoff = new->br_startoff + new->br_blockcount;
1784 ASSERT(PREV.br_startoff <= new->br_startoff);
1785 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
1786
 /*
  * NOTE(review): presumably the indirect-block reservation carried in the
  * delayed extent's start block field -- confirm against startblockval().
  */
1787 da_old = startblockval(PREV.br_startblock);
1788 da_new = 0;
1789
1790 /*
1791 * Set flags determining what part of the previous delayed allocation
1792 * extent is being replaced by a real allocation.
1793 */
1794 if (PREV.br_startoff == new->br_startoff)
1795 state |= BMAP_LEFT_FILLING;
1796 if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
1797 state |= BMAP_RIGHT_FILLING;
1798
1799 /*
1800 * Check and set flags if this segment has a left neighbor.
1801 * Don't set contiguous if the combined extent would be too large.
1802 */
1803 if (bma->idx > 0) {
1804 state |= BMAP_LEFT_VALID;
1805 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
1806
1807 if (isnullstartblock(LEFT.br_startblock))
1808 state |= BMAP_LEFT_DELAY;
1809 }
1810
1811 if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
1812 LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
1813 LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
1814 LEFT.br_state == new->br_state &&
1815 LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
1816 state |= BMAP_LEFT_CONTIG;
1817
1818 /*
1819 * Check and set flags if this segment has a right neighbor.
1820 * Don't set contiguous if the combined extent would be too large.
1821 * Also check for all-three-contiguous being too large.
1822 */
1823 if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
1824 state |= BMAP_RIGHT_VALID;
1825 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
1826
1827 if (isnullstartblock(RIGHT.br_startblock))
1828 state |= BMAP_RIGHT_DELAY;
1829 }
1830
1831 if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
1832 new_endoff == RIGHT.br_startoff &&
1833 new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
1834 new->br_state == RIGHT.br_state &&
1835 new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
1836 ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
1837 BMAP_RIGHT_FILLING)) !=
1838 (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
1839 BMAP_RIGHT_FILLING) ||
1840 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
1841 <= MAXEXTLEN))
1842 state |= BMAP_RIGHT_CONTIG;
1843
1844 error = 0;
1845 /*
1846 * Switch out based on the FILLING and CONTIG state bits.
1847 */
1848 switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
1849 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
1850 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
1851 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
1852 /*
1853 * Filling in all of a previously delayed allocation extent.
1854 * The left and right neighbors are both contiguous with new.
1855 */
1856 bma->idx--;
1857 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1858 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
1859 LEFT.br_blockcount + PREV.br_blockcount +
1860 RIGHT.br_blockcount);
1861 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1862
1863 xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
 /*
  * Two in-core records are removed but di_nextents drops by only one;
  * presumably because the delayed extent PREV was never counted there.
  */
1864 bma->ip->i_d.di_nextents--;
1865 if (bma->cur == NULL)
1866 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1867 else {
1868 rval = XFS_ILOG_CORE;
1869 error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
1870 RIGHT.br_startblock,
1871 RIGHT.br_blockcount, &i);
1872 if (error)
1873 goto done;
1874 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1875 error = xfs_btree_delete(bma->cur, &i);
1876 if (error)
1877 goto done;
1878 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1879 error = xfs_btree_decrement(bma->cur, 0, &i);
1880 if (error)
1881 goto done;
1882 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1883 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
1884 LEFT.br_startblock,
1885 LEFT.br_blockcount +
1886 PREV.br_blockcount +
1887 RIGHT.br_blockcount, LEFT.br_state);
1888 if (error)
1889 goto done;
1890 }
1891 break;
1892
1893 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
1894 /*
1895 * Filling in all of a previously delayed allocation extent.
1896 * The left neighbor is contiguous, the right is not.
1897 */
1898 bma->idx--;
1899
1900 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1901 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
1902 LEFT.br_blockcount + PREV.br_blockcount);
1903 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1904
1905 xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
1906 if (bma->cur == NULL)
1907 rval = XFS_ILOG_DEXT;
1908 else {
1909 rval = 0;
1910 error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
1911 LEFT.br_startblock, LEFT.br_blockcount,
1912 &i);
1913 if (error)
1914 goto done;
1915 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1916 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
1917 LEFT.br_startblock,
1918 LEFT.br_blockcount +
1919 PREV.br_blockcount, LEFT.br_state);
1920 if (error)
1921 goto done;
1922 }
1923 break;
1924
1925 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
1926 /*
1927 * Filling in all of a previously delayed allocation extent.
1928 * The right neighbor is contiguous, the left is not.
1929 */
1930 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1931 xfs_bmbt_set_startblock(ep, new->br_startblock);
1932 xfs_bmbt_set_blockcount(ep,
1933 PREV.br_blockcount + RIGHT.br_blockcount);
1934 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1935
1936 xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
1937 if (bma->cur == NULL)
1938 rval = XFS_ILOG_DEXT;
1939 else {
1940 rval = 0;
1941 error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
1942 RIGHT.br_startblock,
1943 RIGHT.br_blockcount, &i);
1944 if (error)
1945 goto done;
1946 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1947 error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
1948 new->br_startblock,
1949 PREV.br_blockcount +
1950 RIGHT.br_blockcount, PREV.br_state);
1951 if (error)
1952 goto done;
1953 }
1954 break;
1955
1956 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
1957 /*
1958 * Filling in all of a previously delayed allocation extent.
1959 * Neither the left nor right neighbors are contiguous with
1960 * the new one.
1961 */
1962 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1963 xfs_bmbt_set_startblock(ep, new->br_startblock);
1964 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
1965
 /* PREV turns into a real extent in place, so the count grows by one. */
1966 bma->ip->i_d.di_nextents++;
1967 if (bma->cur == NULL)
1968 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1969 else {
1970 rval = XFS_ILOG_CORE;
1971 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
1972 new->br_startblock, new->br_blockcount,
1973 &i);
1974 if (error)
1975 goto done;
1976 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1977 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
1978 error = xfs_btree_insert(bma->cur, &i);
1979 if (error)
1980 goto done;
1981 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1982 }
1983 break;
1984
1985 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
1986 /*
1987 * Filling in the first part of a previous delayed allocation.
1988 * The left neighbor is contiguous.
1989 */
1990 trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
1991 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
1992 LEFT.br_blockcount + new->br_blockcount);
1993 xfs_bmbt_set_startoff(ep,
1994 PREV.br_startoff + new->br_blockcount);
1995 trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
1996
1997 temp = PREV.br_blockcount - new->br_blockcount;
1998 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
1999 xfs_bmbt_set_blockcount(ep, temp);
2000 if (bma->cur == NULL)
2001 rval = XFS_ILOG_DEXT;
2002 else {
2003 rval = 0;
2004 error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
2005 LEFT.br_startblock, LEFT.br_blockcount,
2006 &i);
2007 if (error)
2008 goto done;
2009 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2010 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
2011 LEFT.br_startblock,
2012 LEFT.br_blockcount +
2013 new->br_blockcount,
2014 LEFT.br_state);
2015 if (error)
2016 goto done;
2017 }
 /* The shrunken delayed remainder never reserves more than PREV held. */
2018 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
2019 startblockval(PREV.br_startblock));
2020 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
2021 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
2022
2023 bma->idx--;
2024 break;
2025
2026 case BMAP_LEFT_FILLING:
2027 /*
2028 * Filling in the first part of a previous delayed allocation.
2029 * The left neighbor is not contiguous.
2030 */
2031 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
2032 xfs_bmbt_set_startoff(ep, new_endoff);
2033 temp = PREV.br_blockcount - new->br_blockcount;
2034 xfs_bmbt_set_blockcount(ep, temp);
2035 xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
2036 bma->ip->i_d.di_nextents++;
2037 if (bma->cur == NULL)
2038 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2039 else {
2040 rval = XFS_ILOG_CORE;
2041 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
2042 new->br_startblock, new->br_blockcount,
2043 &i);
2044 if (error)
2045 goto done;
2046 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
2047 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2048 error = xfs_btree_insert(bma->cur, &i);
2049 if (error)
2050 goto done;
2051 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2052 }
2053
2054 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
2055 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
2056 bma->firstblock, bma->flist,
2057 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
2058 rval |= tmp_rval;
2059 if (error)
2060 goto done;
2061 }
2062 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
2063 startblockval(PREV.br_startblock) -
2064 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
 /* The delayed remainder moved to idx + 1 when new was inserted at idx. */
2065 ep = xfs_iext_get_ext(ifp, bma->idx + 1);
2066 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
2067 trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
2068 break;
2069
2070 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
2071 /*
2072 * Filling in the last part of a previous delayed allocation.
2073 * The right neighbor is contiguous with the new allocation.
2074 */
2075 temp = PREV.br_blockcount - new->br_blockcount;
2076 trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
2077 xfs_bmbt_set_blockcount(ep, temp);
2078 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
2079 new->br_startoff, new->br_startblock,
2080 new->br_blockcount + RIGHT.br_blockcount,
2081 RIGHT.br_state);
2082 trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
2083 if (bma->cur == NULL)
2084 rval = XFS_ILOG_DEXT;
2085 else {
2086 rval = 0;
2087 error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
2088 RIGHT.br_startblock,
2089 RIGHT.br_blockcount, &i);
2090 if (error)
2091 goto done;
2092 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2093 error = xfs_bmbt_update(bma->cur, new->br_startoff,
2094 new->br_startblock,
2095 new->br_blockcount +
2096 RIGHT.br_blockcount,
2097 RIGHT.br_state);
2098 if (error)
2099 goto done;
2100 }
2101
2102 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
2103 startblockval(PREV.br_startblock));
2104 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
2105 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
2106 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
2107
2108 bma->idx++;
2109 break;
2110
2111 case BMAP_RIGHT_FILLING:
2112 /*
2113 * Filling in the last part of a previous delayed allocation.
2114 * The right neighbor is not contiguous.
2115 */
2116 temp = PREV.br_blockcount - new->br_blockcount;
2117 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
2118 xfs_bmbt_set_blockcount(ep, temp);
2119 xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
2120 bma->ip->i_d.di_nextents++;
2121 if (bma->cur == NULL)
2122 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2123 else {
2124 rval = XFS_ILOG_CORE;
2125 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
2126 new->br_startblock, new->br_blockcount,
2127 &i);
2128 if (error)
2129 goto done;
2130 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
2131 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2132 error = xfs_btree_insert(bma->cur, &i);
2133 if (error)
2134 goto done;
2135 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2136 }
2137
2138 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
2139 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
2140 bma->firstblock, bma->flist, &bma->cur, 1,
2141 &tmp_rval, XFS_DATA_FORK);
2142 rval |= tmp_rval;
2143 if (error)
2144 goto done;
2145 }
2146 da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
2147 startblockval(PREV.br_startblock) -
2148 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
 /* Re-fetch the record pointer after the insert above. */
2149 ep = xfs_iext_get_ext(ifp, bma->idx);
2150 xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
2151 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
2152
2153 bma->idx++;
2154 break;
2155
2156 case 0:
2157 /*
2158 * Filling in the middle part of a previous delayed allocation.
2159 * Contiguity is impossible here.
2160 * This case is avoided almost all the time.
2161 *
2162 * We start with a delayed allocation:
2163 *
2164 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
2165 * PREV @ idx
2166 *
2167 * and we are allocating:
2168 * +rrrrrrrrrrrrrrrrr+
2169 * new
2170 *
2171 * and we set it up for insertion as:
2172 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
2173 * new
2174 * PREV @ idx LEFT RIGHT
2175 * inserted at idx + 1
2176 */
2177 temp = new->br_startoff - PREV.br_startoff;
2178 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
2179 trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
2180 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
2181 LEFT = *new;
2182 RIGHT.br_state = PREV.br_state;
2183 RIGHT.br_startblock = nullstartblock(
2184 (int)xfs_bmap_worst_indlen(bma->ip, temp2));
2185 RIGHT.br_startoff = new_endoff;
2186 RIGHT.br_blockcount = temp2;
2187 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
2188 xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
2189 bma->ip->i_d.di_nextents++;
2190 if (bma->cur == NULL)
2191 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2192 else {
2193 rval = XFS_ILOG_CORE;
2194 error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
2195 new->br_startblock, new->br_blockcount,
2196 &i);
2197 if (error)
2198 goto done;
2199 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
2200 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2201 error = xfs_btree_insert(bma->cur, &i);
2202 if (error)
2203 goto done;
2204 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2205 }
2206
2207 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
2208 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
2209 bma->firstblock, bma->flist, &bma->cur,
2210 1, &tmp_rval, XFS_DATA_FORK);
2211 rval |= tmp_rval;
2212 if (error)
2213 goto done;
2214 }
 /*
  * temp/temp2 switch meaning here: from the lengths of the two delayed
  * pieces to the worst-case indirect block counts for those pieces.
  */
2215 temp = xfs_bmap_worst_indlen(bma->ip, temp);
2216 temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
 /*
  * If the two pieces together need more than PREV's old reservation
  * (less whatever the cursor already allocated), take the shortfall out
  * of the XFS_SBS_FDBLOCKS counter.
  */
2217 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
2218 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
2219 if (diff > 0) {
2220 error = xfs_icsb_modify_counters(bma->ip->i_mount,
2221 XFS_SBS_FDBLOCKS,
2222 -((int64_t)diff), 0);
2223 ASSERT(!error);
2224 if (error)
2225 goto done;
2226 }
2227
2228 ep = xfs_iext_get_ext(ifp, bma->idx);
2229 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
2230 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
2231 trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
2232 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
2233 nullstartblock((int)temp2));
2234 trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
2235
2236 bma->idx++;
2237 da_new = temp + temp2;
2238 break;
2239
2240 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2241 case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2242 case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
2243 case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
2244 case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2245 case BMAP_LEFT_CONTIG:
2246 case BMAP_RIGHT_CONTIG:
2247 /*
2248 * These cases are all impossible.
2249 */
2250 ASSERT(0);
2251 }
2252
2253 /* convert to a btree if necessary */
2254 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
2255 int tmp_logflags; /* partial log flag return val */
2256
2257 ASSERT(bma->cur == NULL);
2258 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
2259 bma->firstblock, bma->flist, &bma->cur,
2260 da_old > 0, &tmp_logflags, XFS_DATA_FORK);
2261 bma->logflags |= tmp_logflags;
2262 if (error)
2263 goto done;
2264 }
2265
2266 /* adjust for changes in reserved delayed indirect blocks */
 /*
  * What is still needed is da_new plus anything the btree cursor has
  * allocated; whatever is left of da_old beyond that is added back to
  * the XFS_SBS_FDBLOCKS counter.
  * NOTE(review): the return value of xfs_icsb_modify_counters() is not
  * checked on this path, unlike in the "case 0" branch above.
  */
2267 if (da_old || da_new) {
2268 temp = da_new;
2269 if (bma->cur)
2270 temp += bma->cur->bc_private.b.allocated;
2271 ASSERT(temp <= da_old);
2272 if (temp < da_old)
2273 xfs_icsb_modify_counters(bma->ip->i_mount,
2274 XFS_SBS_FDBLOCKS,
2275 (int64_t)(da_old - temp), 0);
2276 }
2277
2278 /* clear out the allocated field, done with it now in any case. */
2279 if (bma->cur)
2280 bma->cur->bc_private.b.allocated = 0;
2281
2282 xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
 /* Reached on success and on error: always publish the logging flags. */
2283done:
2284 bma->logflags |= rval;
2285 return error;
2286#undef LEFT
2287#undef RIGHT
2288#undef PREV
2289}
2290
2291/*
2292 * Convert an unwritten allocation to a real allocation or vice versa.
2293 */
2294STATIC int /* error */
2295xfs_bmap_add_extent_unwritten_real(
2296 struct xfs_trans *tp,
2297 xfs_inode_t *ip, /* incore inode pointer */
2298 xfs_extnum_t *idx, /* extent number to update/insert */
2299 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
2300 xfs_bmbt_irec_t *new, /* new data to add to file extents */
2301 xfs_fsblock_t *first, /* pointer to firstblock variable */
2302 xfs_bmap_free_t *flist, /* list of extents to be freed */
2303 int *logflagsp) /* inode logging flags */
2304{
2305 xfs_btree_cur_t *cur; /* btree cursor */
2306 xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
2307 int error; /* error return value */
2308 int i; /* temp state */
2309 xfs_ifork_t *ifp; /* inode fork pointer */
2310 xfs_fileoff_t new_endoff; /* end offset of new entry */
2311 xfs_exntst_t newext; /* new extent state */
2312 xfs_exntst_t oldext; /* old extent state */
2313 xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
2314 /* left is 0, right is 1, prev is 2 */
2315 int rval=0; /* return value (logging flags) */
2316 int state = 0;/* state bits, accessed thru macros */
2317
2318 *logflagsp = 0;
2319
2320 cur = *curp;
2321 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
2322
2323 ASSERT(*idx >= 0);
2324 ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
2325 ASSERT(!isnullstartblock(new->br_startblock));
2326
2327 XFS_STATS_INC(xs_add_exlist);
2328
2329#define LEFT r[0]
2330#define RIGHT r[1]
2331#define PREV r[2]
2332
2333 /*
2334 * Set up a bunch of variables to make the tests simpler.
2335 */
2336 error = 0;
2337 ep = xfs_iext_get_ext(ifp, *idx);
2338 xfs_bmbt_get_all(ep, &PREV);
2339 newext = new->br_state;
2340 oldext = (newext == XFS_EXT_UNWRITTEN) ?
2341 XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
2342 ASSERT(PREV.br_state == oldext);
2343 new_endoff = new->br_startoff + new->br_blockcount;
2344 ASSERT(PREV.br_startoff <= new->br_startoff);
2345 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
2346
2347 /*
2348 * Set flags determining what part of the previous oldext allocation
2349 * extent is being replaced by a newext allocation.
2350 */
2351 if (PREV.br_startoff == new->br_startoff)
2352 state |= BMAP_LEFT_FILLING;
2353 if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
2354 state |= BMAP_RIGHT_FILLING;
2355
2356 /*
2357 * Check and set flags if this segment has a left neighbor.
2358 * Don't set contiguous if the combined extent would be too large.
2359 */
2360 if (*idx > 0) {
2361 state |= BMAP_LEFT_VALID;
2362 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
2363
2364 if (isnullstartblock(LEFT.br_startblock))
2365 state |= BMAP_LEFT_DELAY;
2366 }
2367
2368 if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
2369 LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
2370 LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
2371 LEFT.br_state == newext &&
2372 LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
2373 state |= BMAP_LEFT_CONTIG;
2374
2375 /*
2376 * Check and set flags if this segment has a right neighbor.
2377 * Don't set contiguous if the combined extent would be too large.
2378 * Also check for all-three-contiguous being too large.
2379 */
2380 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
2381 state |= BMAP_RIGHT_VALID;
2382 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
2383 if (isnullstartblock(RIGHT.br_startblock))
2384 state |= BMAP_RIGHT_DELAY;
2385 }
2386
2387 if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
2388 new_endoff == RIGHT.br_startoff &&
2389 new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
2390 newext == RIGHT.br_state &&
2391 new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
2392 ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
2393 BMAP_RIGHT_FILLING)) !=
2394 (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
2395 BMAP_RIGHT_FILLING) ||
2396 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
2397 <= MAXEXTLEN))
2398 state |= BMAP_RIGHT_CONTIG;
2399
2400 /*
2401 * Switch out based on the FILLING and CONTIG state bits.
2402 */
2403 switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
2404 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
2405 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
2406 BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
2407 /*
2408 * Setting all of a previous oldext extent to newext.
2409 * The left and right neighbors are both contiguous with new.
2410 */
2411 --*idx;
2412
2413 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2414 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
2415 LEFT.br_blockcount + PREV.br_blockcount +
2416 RIGHT.br_blockcount);
2417 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2418
2419 xfs_iext_remove(ip, *idx + 1, 2, state);
2420 ip->i_d.di_nextents -= 2;
2421 if (cur == NULL)
2422 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2423 else {
2424 rval = XFS_ILOG_CORE;
2425 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
2426 RIGHT.br_startblock,
2427 RIGHT.br_blockcount, &i)))
2428 goto done;
2429 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2430 if ((error = xfs_btree_delete(cur, &i)))
2431 goto done;
2432 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2433 if ((error = xfs_btree_decrement(cur, 0, &i)))
2434 goto done;
2435 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2436 if ((error = xfs_btree_delete(cur, &i)))
2437 goto done;
2438 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2439 if ((error = xfs_btree_decrement(cur, 0, &i)))
2440 goto done;
2441 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2442 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2443 LEFT.br_startblock,
2444 LEFT.br_blockcount + PREV.br_blockcount +
2445 RIGHT.br_blockcount, LEFT.br_state)))
2446 goto done;
2447 }
2448 break;
2449
2450 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
2451 /*
2452 * Setting all of a previous oldext extent to newext.
2453 * The left neighbor is contiguous, the right is not.
2454 */
2455 --*idx;
2456
2457 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2458 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
2459 LEFT.br_blockcount + PREV.br_blockcount);
2460 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2461
2462 xfs_iext_remove(ip, *idx + 1, 1, state);
2463 ip->i_d.di_nextents--;
2464 if (cur == NULL)
2465 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2466 else {
2467 rval = XFS_ILOG_CORE;
2468 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
2469 PREV.br_startblock, PREV.br_blockcount,
2470 &i)))
2471 goto done;
2472 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2473 if ((error = xfs_btree_delete(cur, &i)))
2474 goto done;
2475 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2476 if ((error = xfs_btree_decrement(cur, 0, &i)))
2477 goto done;
2478 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2479 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2480 LEFT.br_startblock,
2481 LEFT.br_blockcount + PREV.br_blockcount,
2482 LEFT.br_state)))
2483 goto done;
2484 }
2485 break;
2486
2487 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
2488 /*
2489 * Setting all of a previous oldext extent to newext.
2490 * The right neighbor is contiguous, the left is not.
2491 */
2492 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2493 xfs_bmbt_set_blockcount(ep,
2494 PREV.br_blockcount + RIGHT.br_blockcount);
2495 xfs_bmbt_set_state(ep, newext);
2496 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2497 xfs_iext_remove(ip, *idx + 1, 1, state);
2498 ip->i_d.di_nextents--;
2499 if (cur == NULL)
2500 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2501 else {
2502 rval = XFS_ILOG_CORE;
2503 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
2504 RIGHT.br_startblock,
2505 RIGHT.br_blockcount, &i)))
2506 goto done;
2507 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2508 if ((error = xfs_btree_delete(cur, &i)))
2509 goto done;
2510 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2511 if ((error = xfs_btree_decrement(cur, 0, &i)))
2512 goto done;
2513 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2514 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2515 new->br_startblock,
2516 new->br_blockcount + RIGHT.br_blockcount,
2517 newext)))
2518 goto done;
2519 }
2520 break;
2521
2522 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
2523 /*
2524 * Setting all of a previous oldext extent to newext.
2525 * Neither the left nor right neighbors are contiguous with
2526 * the new one.
2527 */
2528 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2529 xfs_bmbt_set_state(ep, newext);
2530 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2531
2532 if (cur == NULL)
2533 rval = XFS_ILOG_DEXT;
2534 else {
2535 rval = 0;
2536 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
2537 new->br_startblock, new->br_blockcount,
2538 &i)))
2539 goto done;
2540 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2541 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2542 new->br_startblock, new->br_blockcount,
2543 newext)))
2544 goto done;
2545 }
2546 break;
2547
2548 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
2549 /*
2550 * Setting the first part of a previous oldext extent to newext.
2551 * The left neighbor is contiguous.
2552 */
2553 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
2554 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
2555 LEFT.br_blockcount + new->br_blockcount);
2556 xfs_bmbt_set_startoff(ep,
2557 PREV.br_startoff + new->br_blockcount);
2558 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
2559
2560 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2561 xfs_bmbt_set_startblock(ep,
2562 new->br_startblock + new->br_blockcount);
2563 xfs_bmbt_set_blockcount(ep,
2564 PREV.br_blockcount - new->br_blockcount);
2565 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2566
2567 --*idx;
2568
2569 if (cur == NULL)
2570 rval = XFS_ILOG_DEXT;
2571 else {
2572 rval = 0;
2573 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
2574 PREV.br_startblock, PREV.br_blockcount,
2575 &i)))
2576 goto done;
2577 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2578 if ((error = xfs_bmbt_update(cur,
2579 PREV.br_startoff + new->br_blockcount,
2580 PREV.br_startblock + new->br_blockcount,
2581 PREV.br_blockcount - new->br_blockcount,
2582 oldext)))
2583 goto done;
2584 if ((error = xfs_btree_decrement(cur, 0, &i)))
2585 goto done;
2586 error = xfs_bmbt_update(cur, LEFT.br_startoff,
2587 LEFT.br_startblock,
2588 LEFT.br_blockcount + new->br_blockcount,
2589 LEFT.br_state);
2590 if (error)
2591 goto done;
2592 }
2593 break;
2594
2595 case BMAP_LEFT_FILLING:
2596 /*
2597 * Setting the first part of a previous oldext extent to newext.
2598 * The left neighbor is not contiguous.
2599 */
2600 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2601 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
2602 xfs_bmbt_set_startoff(ep, new_endoff);
2603 xfs_bmbt_set_blockcount(ep,
2604 PREV.br_blockcount - new->br_blockcount);
2605 xfs_bmbt_set_startblock(ep,
2606 new->br_startblock + new->br_blockcount);
2607 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2608
2609 xfs_iext_insert(ip, *idx, 1, new, state);
2610 ip->i_d.di_nextents++;
2611 if (cur == NULL)
2612 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2613 else {
2614 rval = XFS_ILOG_CORE;
2615 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
2616 PREV.br_startblock, PREV.br_blockcount,
2617 &i)))
2618 goto done;
2619 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2620 if ((error = xfs_bmbt_update(cur,
2621 PREV.br_startoff + new->br_blockcount,
2622 PREV.br_startblock + new->br_blockcount,
2623 PREV.br_blockcount - new->br_blockcount,
2624 oldext)))
2625 goto done;
2626 cur->bc_rec.b = *new;
2627 if ((error = xfs_btree_insert(cur, &i)))
2628 goto done;
2629 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2630 }
2631 break;
2632
2633 case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
2634 /*
2635 * Setting the last part of a previous oldext extent to newext.
2636 * The right neighbor is contiguous with the new allocation.
2637 */
2638 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2639 xfs_bmbt_set_blockcount(ep,
2640 PREV.br_blockcount - new->br_blockcount);
2641 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2642
2643 ++*idx;
2644
2645 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2646 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
2647 new->br_startoff, new->br_startblock,
2648 new->br_blockcount + RIGHT.br_blockcount, newext);
2649 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2650
2651 if (cur == NULL)
2652 rval = XFS_ILOG_DEXT;
2653 else {
2654 rval = 0;
2655 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
2656 PREV.br_startblock,
2657 PREV.br_blockcount, &i)))
2658 goto done;
2659 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2660 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2661 PREV.br_startblock,
2662 PREV.br_blockcount - new->br_blockcount,
2663 oldext)))
2664 goto done;
2665 if ((error = xfs_btree_increment(cur, 0, &i)))
2666 goto done;
2667 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2668 new->br_startblock,
2669 new->br_blockcount + RIGHT.br_blockcount,
2670 newext)))
2671 goto done;
2672 }
2673 break;
2674
2675 case BMAP_RIGHT_FILLING:
2676 /*
2677 * Setting the last part of a previous oldext extent to newext.
2678 * The right neighbor is not contiguous.
2679 */
2680 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2681 xfs_bmbt_set_blockcount(ep,
2682 PREV.br_blockcount - new->br_blockcount);
2683 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2684
2685 ++*idx;
2686 xfs_iext_insert(ip, *idx, 1, new, state);
2687
2688 ip->i_d.di_nextents++;
2689 if (cur == NULL)
2690 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2691 else {
2692 rval = XFS_ILOG_CORE;
2693 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
2694 PREV.br_startblock, PREV.br_blockcount,
2695 &i)))
2696 goto done;
2697 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2698 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2699 PREV.br_startblock,
2700 PREV.br_blockcount - new->br_blockcount,
2701 oldext)))
2702 goto done;
2703 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
2704 new->br_startblock, new->br_blockcount,
2705 &i)))
2706 goto done;
2707 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
2708 cur->bc_rec.b.br_state = XFS_EXT_NORM;
2709 if ((error = xfs_btree_insert(cur, &i)))
2710 goto done;
2711 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2712 }
2713 break;
2714
2715 case 0:
2716 /*
2717 * Setting the middle part of a previous oldext extent to
2718 * newext. Contiguity is impossible here.
2719 * One extent becomes three extents.
2720 */
2721 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2722 xfs_bmbt_set_blockcount(ep,
2723 new->br_startoff - PREV.br_startoff);
2724 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2725
2726 r[0] = *new;
2727 r[1].br_startoff = new_endoff;
2728 r[1].br_blockcount =
2729 PREV.br_startoff + PREV.br_blockcount - new_endoff;
2730 r[1].br_startblock = new->br_startblock + new->br_blockcount;
2731 r[1].br_state = oldext;
2732
2733 ++*idx;
2734 xfs_iext_insert(ip, *idx, 2, &r[0], state);
2735
2736 ip->i_d.di_nextents += 2;
2737 if (cur == NULL)
2738 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
2739 else {
2740 rval = XFS_ILOG_CORE;
2741 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
2742 PREV.br_startblock, PREV.br_blockcount,
2743 &i)))
2744 goto done;
2745 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2746 /* new right extent - oldext */
2747 if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
2748 r[1].br_startblock, r[1].br_blockcount,
2749 r[1].br_state)))
2750 goto done;
2751 /* new left extent - oldext */
2752 cur->bc_rec.b = PREV;
2753 cur->bc_rec.b.br_blockcount =
2754 new->br_startoff - PREV.br_startoff;
2755 if ((error = xfs_btree_insert(cur, &i)))
2756 goto done;
2757 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2758 /*
2759 * Reset the cursor to the position of the new extent
2760 * we are about to insert as we can't trust it after
2761 * the previous insert.
2762 */
2763 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
2764 new->br_startblock, new->br_blockcount,
2765 &i)))
2766 goto done;
2767 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
2768 /* new middle extent - newext */
2769 cur->bc_rec.b.br_state = new->br_state;
2770 if ((error = xfs_btree_insert(cur, &i)))
2771 goto done;
2772 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2773 }
2774 break;
2775
2776 case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2777 case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2778 case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
2779 case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
2780 case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2781 case BMAP_LEFT_CONTIG:
2782 case BMAP_RIGHT_CONTIG:
2783 /*
2784 * These cases are all impossible.
2785 */
2786 ASSERT(0);
2787 }
2788
2789 /* convert to a btree if necessary */
2790 if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
2791 int tmp_logflags; /* partial log flag return val */
2792
2793 ASSERT(cur == NULL);
2794 error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
2795 0, &tmp_logflags, XFS_DATA_FORK);
2796 *logflagsp |= tmp_logflags;
2797 if (error)
2798 goto done;
2799 }
2800
2801 /* clear out the allocated field, done with it now in any case. */
2802 if (cur) {
2803 cur->bc_private.b.allocated = 0;
2804 *curp = cur;
2805 }
2806
2807 xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
2808done:
2809 *logflagsp |= rval;
2810 return error;
2811#undef LEFT
2812#undef RIGHT
2813#undef PREV
2814}
2815
2816/*
2817 * Convert a hole to a delayed allocation.
2818 */
2819STATIC void
2820xfs_bmap_add_extent_hole_delay(
2821 xfs_inode_t *ip, /* incore inode pointer */
2822 xfs_extnum_t *idx, /* extent number to update/insert */
2823 xfs_bmbt_irec_t *new) /* new data to add to file extents */
2824{
2825 xfs_ifork_t *ifp; /* inode fork pointer */
2826 xfs_bmbt_irec_t left; /* left neighbor extent entry */
2827 xfs_filblks_t newlen=0; /* new indirect size */
2828 xfs_filblks_t oldlen=0; /* old indirect size */
2829 xfs_bmbt_irec_t right; /* right neighbor extent entry */
2830 int state; /* state bits, accessed thru macros */
2831 xfs_filblks_t temp=0; /* temp for indirect calculations */
2832
2833 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
2834 state = 0;
2835 ASSERT(isnullstartblock(new->br_startblock));
2836
2837 /*
2838 * Check and set flags if this segment has a left neighbor
2839 */
2840 if (*idx > 0) {
2841 state |= BMAP_LEFT_VALID;
2842 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
2843
2844 if (isnullstartblock(left.br_startblock))
2845 state |= BMAP_LEFT_DELAY;
2846 }
2847
2848 /*
2849 * Check and set flags if the current (right) segment exists.
2850 * If it doesn't exist, we're converting the hole at end-of-file.
2851 */
2852 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
2853 state |= BMAP_RIGHT_VALID;
2854 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
2855
2856 if (isnullstartblock(right.br_startblock))
2857 state |= BMAP_RIGHT_DELAY;
2858 }
2859
2860 /*
2861 * Set contiguity flags on the left and right neighbors.
2862 * Don't let extents get too large, even if the pieces are contiguous.
2863 */
2864 if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
2865 left.br_startoff + left.br_blockcount == new->br_startoff &&
2866 left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
2867 state |= BMAP_LEFT_CONTIG;
2868
2869 if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
2870 new->br_startoff + new->br_blockcount == right.br_startoff &&
2871 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
2872 (!(state & BMAP_LEFT_CONTIG) ||
2873 (left.br_blockcount + new->br_blockcount +
2874 right.br_blockcount <= MAXEXTLEN)))
2875 state |= BMAP_RIGHT_CONTIG;
2876
2877 /*
2878 * Switch out based on the contiguity flags.
2879 */
2880 switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
2881 case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
2882 /*
2883 * New allocation is contiguous with delayed allocations
2884 * on the left and on the right.
2885 * Merge all three into a single extent record.
2886 */
2887 --*idx;
2888 temp = left.br_blockcount + new->br_blockcount +
2889 right.br_blockcount;
2890
2891 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2892 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
2893 oldlen = startblockval(left.br_startblock) +
2894 startblockval(new->br_startblock) +
2895 startblockval(right.br_startblock);
2896 newlen = xfs_bmap_worst_indlen(ip, temp);
2897 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
2898 nullstartblock((int)newlen));
2899 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2900
2901 xfs_iext_remove(ip, *idx + 1, 1, state);
2902 break;
2903
2904 case BMAP_LEFT_CONTIG:
2905 /*
2906 * New allocation is contiguous with a delayed allocation
2907 * on the left.
2908 * Merge the new allocation with the left neighbor.
2909 */
2910 --*idx;
2911 temp = left.br_blockcount + new->br_blockcount;
2912
2913 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2914 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
2915 oldlen = startblockval(left.br_startblock) +
2916 startblockval(new->br_startblock);
2917 newlen = xfs_bmap_worst_indlen(ip, temp);
2918 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
2919 nullstartblock((int)newlen));
2920 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2921 break;
2922
2923 case BMAP_RIGHT_CONTIG:
2924 /*
2925 * New allocation is contiguous with a delayed allocation
2926 * on the right.
2927 * Merge the new allocation with the right neighbor.
2928 */
2929 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2930 temp = new->br_blockcount + right.br_blockcount;
2931 oldlen = startblockval(new->br_startblock) +
2932 startblockval(right.br_startblock);
2933 newlen = xfs_bmap_worst_indlen(ip, temp);
2934 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
2935 new->br_startoff,
2936 nullstartblock((int)newlen), temp, right.br_state);
2937 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2938 break;
2939
2940 case 0:
2941 /*
2942 * New allocation is not contiguous with another
2943 * delayed allocation.
2944 * Insert a new entry.
2945 */
2946 oldlen = newlen = 0;
2947 xfs_iext_insert(ip, *idx, 1, new, state);
2948 break;
2949 }
2950 if (oldlen != newlen) {
2951 ASSERT(oldlen > newlen);
2952 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
2953 (int64_t)(oldlen - newlen), 0);
2954 /*
2955 * Nothing to do for disk quota accounting here.
2956 */
2957 }
2958}
2959
2960/*
2961 * Convert a hole to a real allocation.
2962 */
2963STATIC int /* error */
2964xfs_bmap_add_extent_hole_real(
2965 struct xfs_bmalloca *bma,
2966 int whichfork)
2967{
2968 struct xfs_bmbt_irec *new = &bma->got;
2969 int error; /* error return value */
2970 int i; /* temp state */
2971 xfs_ifork_t *ifp; /* inode fork pointer */
2972 xfs_bmbt_irec_t left; /* left neighbor extent entry */
2973 xfs_bmbt_irec_t right; /* right neighbor extent entry */
2974 int rval=0; /* return value (logging flags) */
2975 int state; /* state bits, accessed thru macros */
2976
2977 ifp = XFS_IFORK_PTR(bma->ip, whichfork);
2978
2979 ASSERT(bma->idx >= 0);
2980 ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
2981 ASSERT(!isnullstartblock(new->br_startblock));
2982 ASSERT(!bma->cur ||
2983 !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
2984
2985 XFS_STATS_INC(xs_add_exlist);
2986
2987 state = 0;
2988 if (whichfork == XFS_ATTR_FORK)
2989 state |= BMAP_ATTRFORK;
2990
2991 /*
2992 * Check and set flags if this segment has a left neighbor.
2993 */
2994 if (bma->idx > 0) {
2995 state |= BMAP_LEFT_VALID;
2996 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
2997 if (isnullstartblock(left.br_startblock))
2998 state |= BMAP_LEFT_DELAY;
2999 }
3000
3001 /*
3002 * Check and set flags if this segment has a current value.
3003 * Not true if we're inserting into the "hole" at eof.
3004 */
3005 if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
3006 state |= BMAP_RIGHT_VALID;
3007 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
3008 if (isnullstartblock(right.br_startblock))
3009 state |= BMAP_RIGHT_DELAY;
3010 }
3011
3012 /*
3013 * We're inserting a real allocation between "left" and "right".
3014 * Set the contiguity flags. Don't let extents get too large.
3015 */
3016 if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
3017 left.br_startoff + left.br_blockcount == new->br_startoff &&
3018 left.br_startblock + left.br_blockcount == new->br_startblock &&
3019 left.br_state == new->br_state &&
3020 left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
3021 state |= BMAP_LEFT_CONTIG;
3022
3023 if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
3024 new->br_startoff + new->br_blockcount == right.br_startoff &&
3025 new->br_startblock + new->br_blockcount == right.br_startblock &&
3026 new->br_state == right.br_state &&
3027 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
3028 (!(state & BMAP_LEFT_CONTIG) ||
3029 left.br_blockcount + new->br_blockcount +
3030 right.br_blockcount <= MAXEXTLEN))
3031 state |= BMAP_RIGHT_CONTIG;
3032
3033 error = 0;
3034 /*
3035 * Select which case we're in here, and implement it.
3036 */
3037 switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
3038 case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
3039 /*
3040 * New allocation is contiguous with real allocations on the
3041 * left and on the right.
3042 * Merge all three into a single extent record.
3043 */
3044 --bma->idx;
3045 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
3046 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
3047 left.br_blockcount + new->br_blockcount +
3048 right.br_blockcount);
3049 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
3050
3051 xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
3052
3053 XFS_IFORK_NEXT_SET(bma->ip, whichfork,
3054 XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
3055 if (bma->cur == NULL) {
3056 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
3057 } else {
3058 rval = XFS_ILOG_CORE;
3059 error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
3060 right.br_startblock, right.br_blockcount,
3061 &i);
3062 if (error)
3063 goto done;
3064 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3065 error = xfs_btree_delete(bma->cur, &i);
3066 if (error)
3067 goto done;
3068 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3069 error = xfs_btree_decrement(bma->cur, 0, &i);
3070 if (error)
3071 goto done;
3072 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3073 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3074 left.br_startblock,
3075 left.br_blockcount +
3076 new->br_blockcount +
3077 right.br_blockcount,
3078 left.br_state);
3079 if (error)
3080 goto done;
3081 }
3082 break;
3083
3084 case BMAP_LEFT_CONTIG:
3085 /*
3086 * New allocation is contiguous with a real allocation
3087 * on the left.
3088 * Merge the new allocation with the left neighbor.
3089 */
3090 --bma->idx;
3091 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
3092 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
3093 left.br_blockcount + new->br_blockcount);
3094 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
3095
3096 if (bma->cur == NULL) {
3097 rval = xfs_ilog_fext(whichfork);
3098 } else {
3099 rval = 0;
3100 error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
3101 left.br_startblock, left.br_blockcount,
3102 &i);
3103 if (error)
3104 goto done;
3105 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3106 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3107 left.br_startblock,
3108 left.br_blockcount +
3109 new->br_blockcount,
3110 left.br_state);
3111 if (error)
3112 goto done;
3113 }
3114 break;
3115
3116 case BMAP_RIGHT_CONTIG:
3117 /*
3118 * New allocation is contiguous with a real allocation
3119 * on the right.
3120 * Merge the new allocation with the right neighbor.
3121 */
3122 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
3123 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
3124 new->br_startoff, new->br_startblock,
3125 new->br_blockcount + right.br_blockcount,
3126 right.br_state);
3127 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
3128
3129 if (bma->cur == NULL) {
3130 rval = xfs_ilog_fext(whichfork);
3131 } else {
3132 rval = 0;
3133 error = xfs_bmbt_lookup_eq(bma->cur,
3134 right.br_startoff,
3135 right.br_startblock,
3136 right.br_blockcount, &i);
3137 if (error)
3138 goto done;
3139 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3140 error = xfs_bmbt_update(bma->cur, new->br_startoff,
3141 new->br_startblock,
3142 new->br_blockcount +
3143 right.br_blockcount,
3144 right.br_state);
3145 if (error)
3146 goto done;
3147 }
3148 break;
3149
3150 case 0:
3151 /*
3152 * New allocation is not contiguous with another
3153 * real allocation.
3154 * Insert a new entry.
3155 */
3156 xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
3157 XFS_IFORK_NEXT_SET(bma->ip, whichfork,
3158 XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
3159 if (bma->cur == NULL) {
3160 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
3161 } else {
3162 rval = XFS_ILOG_CORE;
3163 error = xfs_bmbt_lookup_eq(bma->cur,
3164 new->br_startoff,
3165 new->br_startblock,
3166 new->br_blockcount, &i);
3167 if (error)
3168 goto done;
3169 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
3170 bma->cur->bc_rec.b.br_state = new->br_state;
3171 error = xfs_btree_insert(bma->cur, &i);
3172 if (error)
3173 goto done;
3174 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3175 }
3176 break;
3177 }
3178
3179 /* convert to a btree if necessary */
3180 if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
3181 int tmp_logflags; /* partial log flag return val */
3182
3183 ASSERT(bma->cur == NULL);
3184 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
3185 bma->firstblock, bma->flist, &bma->cur,
3186 0, &tmp_logflags, whichfork);
3187 bma->logflags |= tmp_logflags;
3188 if (error)
3189 goto done;
3190 }
3191
3192 /* clear out the allocated field, done with it now in any case. */
3193 if (bma->cur)
3194 bma->cur->bc_private.b.allocated = 0;
3195
3196 xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
3197done:
3198 bma->logflags |= rval;
3199 return error;
3200}
3201
3202/*
3203 * Functions used in the extent read, allocate and remove paths
3204 */
3205
3206/*
3207 * Adjust the size of the new extent based on di_extsize and rt extsize.
3208 */
3209int
3210xfs_bmap_extsize_align(
3211 xfs_mount_t *mp,
3212 xfs_bmbt_irec_t *gotp, /* next extent pointer */
3213 xfs_bmbt_irec_t *prevp, /* previous extent pointer */
3214 xfs_extlen_t extsz, /* align to this extent size */
3215 int rt, /* is this a realtime inode? */
3216 int eof, /* is extent at end-of-file? */
3217 int delay, /* creating delalloc extent? */
3218 int convert, /* overwriting unwritten extent? */
3219 xfs_fileoff_t *offp, /* in/out: aligned offset */
3220 xfs_extlen_t *lenp) /* in/out: aligned length */
3221{
3222 xfs_fileoff_t orig_off; /* original offset */
3223 xfs_extlen_t orig_alen; /* original length */
3224 xfs_fileoff_t orig_end; /* original off+len */
3225 xfs_fileoff_t nexto; /* next file offset */
3226 xfs_fileoff_t prevo; /* previous file offset */
3227 xfs_fileoff_t align_off; /* temp for offset */
3228 xfs_extlen_t align_alen; /* temp for length */
3229 xfs_extlen_t temp; /* temp for calculations */
3230
3231 if (convert)
3232 return 0;
3233
3234 orig_off = align_off = *offp;
3235 orig_alen = align_alen = *lenp;
3236 orig_end = orig_off + orig_alen;
3237
3238 /*
3239 * If this request overlaps an existing extent, then don't
3240 * attempt to perform any additional alignment.
3241 */
3242 if (!delay && !eof &&
3243 (orig_off >= gotp->br_startoff) &&
3244 (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
3245 return 0;
3246 }
3247
3248 /*
3249 * If the file offset is unaligned vs. the extent size
3250 * we need to align it. This will be possible unless
3251 * the file was previously written with a kernel that didn't
3252 * perform this alignment, or if a truncate shot us in the
3253 * foot.
3254 */
3255 temp = do_mod(orig_off, extsz);
3256 if (temp) {
3257 align_alen += temp;
3258 align_off -= temp;
3259 }
3260 /*
3261 * Same adjustment for the end of the requested area.
3262 */
3263 if ((temp = (align_alen % extsz))) {
3264 align_alen += extsz - temp;
3265 }
3266 /*
3267 * If the previous block overlaps with this proposed allocation
3268 * then move the start forward without adjusting the length.
3269 */
3270 if (prevp->br_startoff != NULLFILEOFF) {
3271 if (prevp->br_startblock == HOLESTARTBLOCK)
3272 prevo = prevp->br_startoff;
3273 else
3274 prevo = prevp->br_startoff + prevp->br_blockcount;
3275 } else
3276 prevo = 0;
3277 if (align_off != orig_off && align_off < prevo)
3278 align_off = prevo;
3279 /*
3280 * If the next block overlaps with this proposed allocation
3281 * then move the start back without adjusting the length,
3282 * but not before offset 0.
3283 * This may of course make the start overlap previous block,
3284 * and if we hit the offset 0 limit then the next block
3285 * can still overlap too.
3286 */
3287 if (!eof && gotp->br_startoff != NULLFILEOFF) {
3288 if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
3289 (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
3290 nexto = gotp->br_startoff + gotp->br_blockcount;
3291 else
3292 nexto = gotp->br_startoff;
3293 } else
3294 nexto = NULLFILEOFF;
3295 if (!eof &&
3296 align_off + align_alen != orig_end &&
3297 align_off + align_alen > nexto)
3298 align_off = nexto > align_alen ? nexto - align_alen : 0;
3299 /*
3300 * If we're now overlapping the next or previous extent that
3301 * means we can't fit an extsz piece in this hole. Just move
3302 * the start forward to the first valid spot and set
3303 * the length so we hit the end.
3304 */
3305 if (align_off != orig_off && align_off < prevo)
3306 align_off = prevo;
3307 if (align_off + align_alen != orig_end &&
3308 align_off + align_alen > nexto &&
3309 nexto != NULLFILEOFF) {
3310 ASSERT(nexto > prevo);
3311 align_alen = nexto - align_off;
3312 }
3313
3314 /*
3315 * If realtime, and the result isn't a multiple of the realtime
3316 * extent size we need to remove blocks until it is.
3317 */
3318 if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
3319 /*
3320 * We're not covering the original request, or
3321 * we won't be able to once we fix the length.
3322 */
3323 if (orig_off < align_off ||
3324 orig_end > align_off + align_alen ||
3325 align_alen - temp < orig_alen)
3326 return -EINVAL;
3327 /*
3328 * Try to fix it by moving the start up.
3329 */
3330 if (align_off + temp <= orig_off) {
3331 align_alen -= temp;
3332 align_off += temp;
3333 }
3334 /*
3335 * Try to fix it by moving the end in.
3336 */
3337 else if (align_off + align_alen - temp >= orig_end)
3338 align_alen -= temp;
3339 /*
3340 * Set the start to the minimum then trim the length.
3341 */
3342 else {
3343 align_alen -= orig_off - align_off;
3344 align_off = orig_off;
3345 align_alen -= align_alen % mp->m_sb.sb_rextsize;
3346 }
3347 /*
3348 * Result doesn't cover the request, fail it.
3349 */
3350 if (orig_off < align_off || orig_end > align_off + align_alen)
3351 return -EINVAL;
3352 } else {
3353 ASSERT(orig_off >= align_off);
3354 ASSERT(orig_end <= align_off + align_alen);
3355 }
3356
3357#ifdef DEBUG
3358 if (!eof && gotp->br_startoff != NULLFILEOFF)
3359 ASSERT(align_off + align_alen <= gotp->br_startoff);
3360 if (prevp->br_startoff != NULLFILEOFF)
3361 ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
3362#endif
3363
3364 *lenp = align_alen;
3365 *offp = align_off;
3366 return 0;
3367}
3368
/*
 * Multiplier applied to the allocation length in xfs_bmap_adjacent(): the
 * gap between a neighbouring extent and the requested offset is only added
 * to the proposed start block when it is no larger than this many times
 * the length being allocated.
 */
#define XFS_ALLOC_GAP_UNITS 4
3370
3371void
3372xfs_bmap_adjacent(
3373 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
3374{
3375 xfs_fsblock_t adjust; /* adjustment to block numbers */
3376 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
3377 xfs_mount_t *mp; /* mount point structure */
3378 int nullfb; /* true if ap->firstblock isn't set */
3379 int rt; /* true if inode is realtime */
3380
3381#define ISVALID(x,y) \
3382 (rt ? \
3383 (x) < mp->m_sb.sb_rblocks : \
3384 XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
3385 XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
3386 XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
3387
3388 mp = ap->ip->i_mount;
3389 nullfb = *ap->firstblock == NULLFSBLOCK;
3390 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
3391 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
3392 /*
3393 * If allocating at eof, and there's a previous real block,
3394 * try to use its last block as our starting point.
3395 */
3396 if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
3397 !isnullstartblock(ap->prev.br_startblock) &&
3398 ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
3399 ap->prev.br_startblock)) {
3400 ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
3401 /*
3402 * Adjust for the gap between prevp and us.
3403 */
3404 adjust = ap->offset -
3405 (ap->prev.br_startoff + ap->prev.br_blockcount);
3406 if (adjust &&
3407 ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
3408 ap->blkno += adjust;
3409 }
3410 /*
3411 * If not at eof, then compare the two neighbor blocks.
3412 * Figure out whether either one gives us a good starting point,
3413 * and pick the better one.
3414 */
3415 else if (!ap->eof) {
3416 xfs_fsblock_t gotbno; /* right side block number */
3417 xfs_fsblock_t gotdiff=0; /* right side difference */
3418 xfs_fsblock_t prevbno; /* left side block number */
3419 xfs_fsblock_t prevdiff=0; /* left side difference */
3420
3421 /*
3422 * If there's a previous (left) block, select a requested
3423 * start block based on it.
3424 */
3425 if (ap->prev.br_startoff != NULLFILEOFF &&
3426 !isnullstartblock(ap->prev.br_startblock) &&
3427 (prevbno = ap->prev.br_startblock +
3428 ap->prev.br_blockcount) &&
3429 ISVALID(prevbno, ap->prev.br_startblock)) {
3430 /*
3431 * Calculate gap to end of previous block.
3432 */
3433 adjust = prevdiff = ap->offset -
3434 (ap->prev.br_startoff +
3435 ap->prev.br_blockcount);
3436 /*
3437 * Figure the startblock based on the previous block's
3438 * end and the gap size.
3439 * Heuristic!
3440 * If the gap is large relative to the piece we're
3441 * allocating, or using it gives us an invalid block
3442 * number, then just use the end of the previous block.
3443 */
3444 if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
3445 ISVALID(prevbno + prevdiff,
3446 ap->prev.br_startblock))
3447 prevbno += adjust;
3448 else
3449 prevdiff += adjust;
3450 /*
3451 * If the firstblock forbids it, can't use it,
3452 * must use default.
3453 */
3454 if (!rt && !nullfb &&
3455 XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
3456 prevbno = NULLFSBLOCK;
3457 }
3458 /*
3459 * No previous block or can't follow it, just default.
3460 */
3461 else
3462 prevbno = NULLFSBLOCK;
3463 /*
3464 * If there's a following (right) block, select a requested
3465 * start block based on it.
3466 */
3467 if (!isnullstartblock(ap->got.br_startblock)) {
3468 /*
3469 * Calculate gap to start of next block.
3470 */
3471 adjust = gotdiff = ap->got.br_startoff - ap->offset;
3472 /*
3473 * Figure the startblock based on the next block's
3474 * start and the gap size.
3475 */
3476 gotbno = ap->got.br_startblock;
3477 /*
3478 * Heuristic!
3479 * If the gap is large relative to the piece we're
3480 * allocating, or using it gives us an invalid block
3481 * number, then just use the start of the next block
3482 * offset by our length.
3483 */
3484 if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
3485 ISVALID(gotbno - gotdiff, gotbno))
3486 gotbno -= adjust;
3487 else if (ISVALID(gotbno - ap->length, gotbno)) {
3488 gotbno -= ap->length;
3489 gotdiff += adjust - ap->length;
3490 } else
3491 gotdiff += adjust;
3492 /*
3493 * If the firstblock forbids it, can't use it,
3494 * must use default.
3495 */
3496 if (!rt && !nullfb &&
3497 XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
3498 gotbno = NULLFSBLOCK;
3499 }
3500 /*
3501 * No next block, just default.
3502 */
3503 else
3504 gotbno = NULLFSBLOCK;
3505 /*
3506 * If both valid, pick the better one, else the only good
3507 * one, else ap->blkno is already set (to 0 or the inode block).
3508 */
3509 if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
3510 ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
3511 else if (prevbno != NULLFSBLOCK)
3512 ap->blkno = prevbno;
3513 else if (gotbno != NULLFSBLOCK)
3514 ap->blkno = gotbno;
3515 }
3516#undef ISVALID
3517}
3518
3519static int
3520xfs_bmap_longest_free_extent(
3521 struct xfs_trans *tp,
3522 xfs_agnumber_t ag,
3523 xfs_extlen_t *blen,
3524 int *notinit)
3525{
3526 struct xfs_mount *mp = tp->t_mountp;
3527 struct xfs_perag *pag;
3528 xfs_extlen_t longest;
3529 int error = 0;
3530
3531 pag = xfs_perag_get(mp, ag);
3532 if (!pag->pagf_init) {
3533 error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
3534 if (error)
3535 goto out;
3536
3537 if (!pag->pagf_init) {
3538 *notinit = 1;
3539 goto out;
3540 }
3541 }
3542
3543 longest = xfs_alloc_longest_free_extent(mp, pag);
3544 if (*blen < longest)
3545 *blen = longest;
3546
3547out:
3548 xfs_perag_put(pag);
3549 return error;
3550}
3551
3552static void
3553xfs_bmap_select_minlen(
3554 struct xfs_bmalloca *ap,
3555 struct xfs_alloc_arg *args,
3556 xfs_extlen_t *blen,
3557 int notinit)
3558{
3559 if (notinit || *blen < ap->minlen) {
3560 /*
3561 * Since we did a BUF_TRYLOCK above, it is possible that
3562 * there is space for this request.
3563 */
3564 args->minlen = ap->minlen;
3565 } else if (*blen < args->maxlen) {
3566 /*
3567 * If the best seen length is less than the request length,
3568 * use the best as the minimum.
3569 */
3570 args->minlen = *blen;
3571 } else {
3572 /*
3573 * Otherwise we've seen an extent as big as maxlen, use that
3574 * as the minimum.
3575 */
3576 args->minlen = args->maxlen;
3577 }
3578}
3579
3580STATIC int
3581xfs_bmap_btalloc_nullfb(
3582 struct xfs_bmalloca *ap,
3583 struct xfs_alloc_arg *args,
3584 xfs_extlen_t *blen)
3585{
3586 struct xfs_mount *mp = ap->ip->i_mount;
3587 xfs_agnumber_t ag, startag;
3588 int notinit = 0;
3589 int error;
3590
3591 args->type = XFS_ALLOCTYPE_START_BNO;
3592 args->total = ap->total;
3593
3594 startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
3595 if (startag == NULLAGNUMBER)
3596 startag = ag = 0;
3597
3598 while (*blen < args->maxlen) {
3599 error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
3600 &notinit);
3601 if (error)
3602 return error;
3603
3604 if (++ag == mp->m_sb.sb_agcount)
3605 ag = 0;
3606 if (ag == startag)
3607 break;
3608 }
3609
3610 xfs_bmap_select_minlen(ap, args, blen, notinit);
3611 return 0;
3612}
3613
3614STATIC int
3615xfs_bmap_btalloc_filestreams(
3616 struct xfs_bmalloca *ap,
3617 struct xfs_alloc_arg *args,
3618 xfs_extlen_t *blen)
3619{
3620 struct xfs_mount *mp = ap->ip->i_mount;
3621 xfs_agnumber_t ag;
3622 int notinit = 0;
3623 int error;
3624
3625 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3626 args->total = ap->total;
3627
3628 ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
3629 if (ag == NULLAGNUMBER)
3630 ag = 0;
3631
3632 error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
3633 if (error)
3634 return error;
3635
3636 if (*blen < args->maxlen) {
3637 error = xfs_filestream_new_ag(ap, &ag);
3638 if (error)
3639 return error;
3640
3641 error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
3642 &notinit);
3643 if (error)
3644 return error;
3645
3646 }
3647
3648 xfs_bmap_select_minlen(ap, args, blen, notinit);
3649
3650 /*
3651 * Set the failure fallback case to look in the selected AG as stream
3652 * may have moved.
3653 */
3654 ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
3655 return 0;
3656}
3657
3658STATIC int
3659xfs_bmap_btalloc(
3660 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
3661{
3662 xfs_mount_t *mp; /* mount point structure */
3663 xfs_alloctype_t atype = 0; /* type for allocation routines */
3664 xfs_extlen_t align; /* minimum allocation alignment */
3665 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
3666 xfs_agnumber_t ag;
3667 xfs_alloc_arg_t args;
3668 xfs_extlen_t blen;
3669 xfs_extlen_t nextminlen = 0;
3670 int nullfb; /* true if ap->firstblock isn't set */
3671 int isaligned;
3672 int tryagain;
3673 int error;
3674 int stripe_align;
3675
3676 ASSERT(ap->length);
3677
3678 mp = ap->ip->i_mount;
3679
3680 /* stripe alignment for allocation is determined by mount parameters */
3681 stripe_align = 0;
3682 if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
3683 stripe_align = mp->m_swidth;
3684 else if (mp->m_dalign)
3685 stripe_align = mp->m_dalign;
3686
3687 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
3688 if (unlikely(align)) {
3689 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
3690 align, 0, ap->eof, 0, ap->conv,
3691 &ap->offset, &ap->length);
3692 ASSERT(!error);
3693 ASSERT(ap->length);
3694 }
3695
3696
3697 nullfb = *ap->firstblock == NULLFSBLOCK;
3698 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
3699 if (nullfb) {
3700 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
3701 ag = xfs_filestream_lookup_ag(ap->ip);
3702 ag = (ag != NULLAGNUMBER) ? ag : 0;
3703 ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
3704 } else {
3705 ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
3706 }
3707 } else
3708 ap->blkno = *ap->firstblock;
3709
3710 xfs_bmap_adjacent(ap);
3711
3712 /*
3713 * If allowed, use ap->blkno; otherwise must use firstblock since
3714 * it's in the right allocation group.
3715 */
3716 if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
3717 ;
3718 else
3719 ap->blkno = *ap->firstblock;
3720 /*
3721 * Normal allocation, done through xfs_alloc_vextent.
3722 */
3723 tryagain = isaligned = 0;
3724 memset(&args, 0, sizeof(args));
3725 args.tp = ap->tp;
3726 args.mp = mp;
3727 args.fsbno = ap->blkno;
3728
3729 /* Trim the allocation back to the maximum an AG can fit. */
3730 args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
3731 args.firstblock = *ap->firstblock;
3732 blen = 0;
3733 if (nullfb) {
3734 /*
3735 * Search for an allocation group with a single extent large
3736 * enough for the request. If one isn't found, then adjust
3737 * the minimum allocation size to the largest space found.
3738 */
3739 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
3740 error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
3741 else
3742 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
3743 if (error)
3744 return error;
3745 } else if (ap->flist->xbf_low) {
3746 if (xfs_inode_is_filestream(ap->ip))
3747 args.type = XFS_ALLOCTYPE_FIRST_AG;
3748 else
3749 args.type = XFS_ALLOCTYPE_START_BNO;
3750 args.total = args.minlen = ap->minlen;
3751 } else {
3752 args.type = XFS_ALLOCTYPE_NEAR_BNO;
3753 args.total = ap->total;
3754 args.minlen = ap->minlen;
3755 }
3756 /* apply extent size hints if obtained earlier */
3757 if (unlikely(align)) {
3758 args.prod = align;
3759 if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
3760 args.mod = (xfs_extlen_t)(args.prod - args.mod);
3761 } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
3762 args.prod = 1;
3763 args.mod = 0;
3764 } else {
3765 args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
3766 if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
3767 args.mod = (xfs_extlen_t)(args.prod - args.mod);
3768 }
3769 /*
3770 * If we are not low on available data blocks, and the
3771 * underlying logical volume manager is a stripe, and
3772 * the file offset is zero then try to allocate data
3773 * blocks on stripe unit boundary.
3774 * NOTE: ap->aeof is only set if the allocation length
3775 * is >= the stripe unit and the allocation offset is
3776 * at the end of file.
3777 */
3778 if (!ap->flist->xbf_low && ap->aeof) {
3779 if (!ap->offset) {
3780 args.alignment = stripe_align;
3781 atype = args.type;
3782 isaligned = 1;
3783 /*
3784 * Adjust for alignment
3785 */
3786 if (blen > args.alignment && blen <= args.maxlen)
3787 args.minlen = blen - args.alignment;
3788 args.minalignslop = 0;
3789 } else {
3790 /*
3791 * First try an exact bno allocation.
3792 * If it fails then do a near or start bno
3793 * allocation with alignment turned on.
3794 */
3795 atype = args.type;
3796 tryagain = 1;
3797 args.type = XFS_ALLOCTYPE_THIS_BNO;
3798 args.alignment = 1;
3799 /*
3800 * Compute the minlen+alignment for the
3801 * next case. Set slop so that the value
3802 * of minlen+alignment+slop doesn't go up
3803 * between the calls.
3804 */
3805 if (blen > stripe_align && blen <= args.maxlen)
3806 nextminlen = blen - stripe_align;
3807 else
3808 nextminlen = args.minlen;
3809 if (nextminlen + stripe_align > args.minlen + 1)
3810 args.minalignslop =
3811 nextminlen + stripe_align -
3812 args.minlen - 1;
3813 else
3814 args.minalignslop = 0;
3815 }
3816 } else {
3817 args.alignment = 1;
3818 args.minalignslop = 0;
3819 }
3820 args.minleft = ap->minleft;
3821 args.wasdel = ap->wasdel;
3822 args.isfl = 0;
3823 args.userdata = ap->userdata;
3824 if ((error = xfs_alloc_vextent(&args)))
3825 return error;
3826 if (tryagain && args.fsbno == NULLFSBLOCK) {
3827 /*
3828 * Exact allocation failed. Now try with alignment
3829 * turned on.
3830 */
3831 args.type = atype;
3832 args.fsbno = ap->blkno;
3833 args.alignment = stripe_align;
3834 args.minlen = nextminlen;
3835 args.minalignslop = 0;
3836 isaligned = 1;
3837 if ((error = xfs_alloc_vextent(&args)))
3838 return error;
3839 }
3840 if (isaligned && args.fsbno == NULLFSBLOCK) {
3841 /*
3842 * allocation failed, so turn off alignment and
3843 * try again.
3844 */
3845 args.type = atype;
3846 args.fsbno = ap->blkno;
3847 args.alignment = 0;
3848 if ((error = xfs_alloc_vextent(&args)))
3849 return error;
3850 }
3851 if (args.fsbno == NULLFSBLOCK && nullfb &&
3852 args.minlen > ap->minlen) {
3853 args.minlen = ap->minlen;
3854 args.type = XFS_ALLOCTYPE_START_BNO;
3855 args.fsbno = ap->blkno;
3856 if ((error = xfs_alloc_vextent(&args)))
3857 return error;
3858 }
3859 if (args.fsbno == NULLFSBLOCK && nullfb) {
3860 args.fsbno = 0;
3861 args.type = XFS_ALLOCTYPE_FIRST_AG;
3862 args.total = ap->minlen;
3863 args.minleft = 0;
3864 if ((error = xfs_alloc_vextent(&args)))
3865 return error;
3866 ap->flist->xbf_low = 1;
3867 }
3868 if (args.fsbno != NULLFSBLOCK) {
3869 /*
3870 * check the allocation happened at the same or higher AG than
3871 * the first block that was allocated.
3872 */
3873 ASSERT(*ap->firstblock == NULLFSBLOCK ||
3874 XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
3875 XFS_FSB_TO_AGNO(mp, args.fsbno) ||
3876 (ap->flist->xbf_low &&
3877 XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
3878 XFS_FSB_TO_AGNO(mp, args.fsbno)));
3879
3880 ap->blkno = args.fsbno;
3881 if (*ap->firstblock == NULLFSBLOCK)
3882 *ap->firstblock = args.fsbno;
3883 ASSERT(nullfb || fb_agno == args.agno ||
3884 (ap->flist->xbf_low && fb_agno < args.agno));
3885 ap->length = args.len;
3886 ap->ip->i_d.di_nblocks += args.len;
3887 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
3888 if (ap->wasdel)
3889 ap->ip->i_delayed_blks -= args.len;
3890 /*
3891 * Adjust the disk quota also. This was reserved
3892 * earlier.
3893 */
3894 xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
3895 ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
3896 XFS_TRANS_DQ_BCOUNT,
3897 (long) args.len);
3898 } else {
3899 ap->blkno = NULLFSBLOCK;
3900 ap->length = 0;
3901 }
3902 return 0;
3903}
3904
3905/*
3906 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
3907 * It figures out where to ask the underlying allocator to put the new extent.
3908 */
3909STATIC int
3910xfs_bmap_alloc(
3911 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
3912{
3913 if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
3914 return xfs_bmap_rtalloc(ap);
3915 return xfs_bmap_btalloc(ap);
3916}
3917
3918/*
3919 * Trim the returned map to the required bounds
3920 */
3921STATIC void
3922xfs_bmapi_trim_map(
3923 struct xfs_bmbt_irec *mval,
3924 struct xfs_bmbt_irec *got,
3925 xfs_fileoff_t *bno,
3926 xfs_filblks_t len,
3927 xfs_fileoff_t obno,
3928 xfs_fileoff_t end,
3929 int n,
3930 int flags)
3931{
3932 if ((flags & XFS_BMAPI_ENTIRE) ||
3933 got->br_startoff + got->br_blockcount <= obno) {
3934 *mval = *got;
3935 if (isnullstartblock(got->br_startblock))
3936 mval->br_startblock = DELAYSTARTBLOCK;
3937 return;
3938 }
3939
3940 if (obno > *bno)
3941 *bno = obno;
3942 ASSERT((*bno >= obno) || (n == 0));
3943 ASSERT(*bno < end);
3944 mval->br_startoff = *bno;
3945 if (isnullstartblock(got->br_startblock))
3946 mval->br_startblock = DELAYSTARTBLOCK;
3947 else
3948 mval->br_startblock = got->br_startblock +
3949 (*bno - got->br_startoff);
3950 /*
3951 * Return the minimum of what we got and what we asked for for
3952 * the length. We can use the len variable here because it is
3953 * modified below and we could have been there before coming
3954 * here if the first part of the allocation didn't overlap what
3955 * was asked for.
3956 */
3957 mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
3958 got->br_blockcount - (*bno - got->br_startoff));
3959 mval->br_state = got->br_state;
3960 ASSERT(mval->br_blockcount <= len);
3961 return;
3962}
3963
3964/*
3965 * Update and validate the extent map to return
3966 */
3967STATIC void
3968xfs_bmapi_update_map(
3969 struct xfs_bmbt_irec **map,
3970 xfs_fileoff_t *bno,
3971 xfs_filblks_t *len,
3972 xfs_fileoff_t obno,
3973 xfs_fileoff_t end,
3974 int *n,
3975 int flags)
3976{
3977 xfs_bmbt_irec_t *mval = *map;
3978
3979 ASSERT((flags & XFS_BMAPI_ENTIRE) ||
3980 ((mval->br_startoff + mval->br_blockcount) <= end));
3981 ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
3982 (mval->br_startoff < obno));
3983
3984 *bno = mval->br_startoff + mval->br_blockcount;
3985 *len = end - *bno;
3986 if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
3987 /* update previous map with new information */
3988 ASSERT(mval->br_startblock == mval[-1].br_startblock);
3989 ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
3990 ASSERT(mval->br_state == mval[-1].br_state);
3991 mval[-1].br_blockcount = mval->br_blockcount;
3992 mval[-1].br_state = mval->br_state;
3993 } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
3994 mval[-1].br_startblock != DELAYSTARTBLOCK &&
3995 mval[-1].br_startblock != HOLESTARTBLOCK &&
3996 mval->br_startblock == mval[-1].br_startblock +
3997 mval[-1].br_blockcount &&
3998 ((flags & XFS_BMAPI_IGSTATE) ||
3999 mval[-1].br_state == mval->br_state)) {
4000 ASSERT(mval->br_startoff ==
4001 mval[-1].br_startoff + mval[-1].br_blockcount);
4002 mval[-1].br_blockcount += mval->br_blockcount;
4003 } else if (*n > 0 &&
4004 mval->br_startblock == DELAYSTARTBLOCK &&
4005 mval[-1].br_startblock == DELAYSTARTBLOCK &&
4006 mval->br_startoff ==
4007 mval[-1].br_startoff + mval[-1].br_blockcount) {
4008 mval[-1].br_blockcount += mval->br_blockcount;
4009 mval[-1].br_state = mval->br_state;
4010 } else if (!((*n == 0) &&
4011 ((mval->br_startoff + mval->br_blockcount) <=
4012 obno))) {
4013 mval++;
4014 (*n)++;
4015 }
4016 *map = mval;
4017}
4018
4019/*
4020 * Map file blocks to filesystem blocks without allocation.
4021 */
4022int
4023xfs_bmapi_read(
4024 struct xfs_inode *ip,
4025 xfs_fileoff_t bno,
4026 xfs_filblks_t len,
4027 struct xfs_bmbt_irec *mval,
4028 int *nmap,
4029 int flags)
4030{
4031 struct xfs_mount *mp = ip->i_mount;
4032 struct xfs_ifork *ifp;
4033 struct xfs_bmbt_irec got;
4034 struct xfs_bmbt_irec prev;
4035 xfs_fileoff_t obno;
4036 xfs_fileoff_t end;
4037 xfs_extnum_t lastx;
4038 int error;
4039 int eof;
4040 int n = 0;
4041 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4042 XFS_ATTR_FORK : XFS_DATA_FORK;
4043
4044 ASSERT(*nmap >= 1);
4045 ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
4046 XFS_BMAPI_IGSTATE)));
4047 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
4048
4049 if (unlikely(XFS_TEST_ERROR(
4050 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4051 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4052 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4053 XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
4054 return -EFSCORRUPTED;
4055 }
4056
4057 if (XFS_FORCED_SHUTDOWN(mp))
4058 return -EIO;
4059
4060 XFS_STATS_INC(xs_blk_mapr);
4061
4062 ifp = XFS_IFORK_PTR(ip, whichfork);
4063
4064 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4065 error = xfs_iread_extents(NULL, ip, whichfork);
4066 if (error)
4067 return error;
4068 }
4069
4070 xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
4071 end = bno + len;
4072 obno = bno;
4073
4074 while (bno < end && n < *nmap) {
4075 /* Reading past eof, act as though there's a hole up to end. */
4076 if (eof)
4077 got.br_startoff = end;
4078 if (got.br_startoff > bno) {
4079 /* Reading in a hole. */
4080 mval->br_startoff = bno;
4081 mval->br_startblock = HOLESTARTBLOCK;
4082 mval->br_blockcount =
4083 XFS_FILBLKS_MIN(len, got.br_startoff - bno);
4084 mval->br_state = XFS_EXT_NORM;
4085 bno += mval->br_blockcount;
4086 len -= mval->br_blockcount;
4087 mval++;
4088 n++;
4089 continue;
4090 }
4091
4092 /* set up the extent map to return. */
4093 xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
4094 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4095
4096 /* If we're done, stop now. */
4097 if (bno >= end || n >= *nmap)
4098 break;
4099
4100 /* Else go on to the next record. */
4101 if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
4102 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
4103 else
4104 eof = 1;
4105 }
4106 *nmap = n;
4107 return 0;
4108}
4109
4110STATIC int
4111xfs_bmapi_reserve_delalloc(
4112 struct xfs_inode *ip,
4113 xfs_fileoff_t aoff,
4114 xfs_filblks_t len,
4115 struct xfs_bmbt_irec *got,
4116 struct xfs_bmbt_irec *prev,
4117 xfs_extnum_t *lastx,
4118 int eof)
4119{
4120 struct xfs_mount *mp = ip->i_mount;
4121 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
4122 xfs_extlen_t alen;
4123 xfs_extlen_t indlen;
4124 char rt = XFS_IS_REALTIME_INODE(ip);
4125 xfs_extlen_t extsz;
4126 int error;
4127
4128 alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
4129 if (!eof)
4130 alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
4131
4132 /* Figure out the extent size, adjust alen */
4133 extsz = xfs_get_extsz_hint(ip);
4134 if (extsz) {
4135 /*
4136 * Make sure we don't exceed a single extent length when we
4137 * align the extent by reducing length we are going to
4138 * allocate by the maximum amount extent size aligment may
4139 * require.
4140 */
4141 alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
4142 error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
4143 1, 0, &aoff, &alen);
4144 ASSERT(!error);
4145 }
4146
4147 if (rt)
4148 extsz = alen / mp->m_sb.sb_rextsize;
4149
4150 /*
4151 * Make a transaction-less quota reservation for delayed allocation
4152 * blocks. This number gets adjusted later. We return if we haven't
4153 * allocated blocks already inside this loop.
4154 */
4155 error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
4156 rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4157 if (error)
4158 return error;
4159
4160 /*
4161 * Split changing sb for alen and indlen since they could be coming
4162 * from different places.
4163 */
4164 indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
4165 ASSERT(indlen > 0);
4166
4167 if (rt) {
4168 error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
4169 -((int64_t)extsz), 0);
4170 } else {
4171 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
4172 -((int64_t)alen), 0);
4173 }
4174
4175 if (error)
4176 goto out_unreserve_quota;
4177
4178 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
4179 -((int64_t)indlen), 0);
4180 if (error)
4181 goto out_unreserve_blocks;
4182
4183
4184 ip->i_delayed_blks += alen;
4185
4186 got->br_startoff = aoff;
4187 got->br_startblock = nullstartblock(indlen);
4188 got->br_blockcount = alen;
4189 got->br_state = XFS_EXT_NORM;
4190 xfs_bmap_add_extent_hole_delay(ip, lastx, got);
4191
4192 /*
4193 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
4194 * might have merged it into one of the neighbouring ones.
4195 */
4196 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
4197
4198 ASSERT(got->br_startoff <= aoff);
4199 ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
4200 ASSERT(isnullstartblock(got->br_startblock));
4201 ASSERT(got->br_state == XFS_EXT_NORM);
4202 return 0;
4203
4204out_unreserve_blocks:
4205 if (rt)
4206 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
4207 else
4208 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
4209out_unreserve_quota:
4210 if (XFS_IS_QUOTA_ON(mp))
4211 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
4212 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4213 return error;
4214}
4215
4216/*
4217 * Map file blocks to filesystem blocks, adding delayed allocations as needed.
4218 */
4219int
4220xfs_bmapi_delay(
4221 struct xfs_inode *ip, /* incore inode */
4222 xfs_fileoff_t bno, /* starting file offs. mapped */
4223 xfs_filblks_t len, /* length to map in file */
4224 struct xfs_bmbt_irec *mval, /* output: map values */
4225 int *nmap, /* i/o: mval size/count */
4226 int flags) /* XFS_BMAPI_... */
4227{
4228 struct xfs_mount *mp = ip->i_mount;
4229 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
4230 struct xfs_bmbt_irec got; /* current file extent record */
4231 struct xfs_bmbt_irec prev; /* previous file extent record */
4232 xfs_fileoff_t obno; /* old block number (offset) */
4233 xfs_fileoff_t end; /* end of mapped file region */
4234 xfs_extnum_t lastx; /* last useful extent number */
4235 int eof; /* we've hit the end of extents */
4236 int n = 0; /* current extent index */
4237 int error = 0;
4238
4239 ASSERT(*nmap >= 1);
4240 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4241 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
4242 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
4243
4244 if (unlikely(XFS_TEST_ERROR(
4245 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
4246 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
4247 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4248 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
4249 return -EFSCORRUPTED;
4250 }
4251
4252 if (XFS_FORCED_SHUTDOWN(mp))
4253 return -EIO;
4254
4255 XFS_STATS_INC(xs_blk_mapw);
4256
4257 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4258 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
4259 if (error)
4260 return error;
4261 }
4262
4263 xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
4264 end = bno + len;
4265 obno = bno;
4266
4267 while (bno < end && n < *nmap) {
4268 if (eof || got.br_startoff > bno) {
4269 error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
4270 &prev, &lastx, eof);
4271 if (error) {
4272 if (n == 0) {
4273 *nmap = 0;
4274 return error;
4275 }
4276 break;
4277 }
4278 }
4279
4280 /* set up the extent map to return. */
4281 xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
4282 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4283
4284 /* If we're done, stop now. */
4285 if (bno >= end || n >= *nmap)
4286 break;
4287
4288 /* Else go on to the next record. */
4289 prev = got;
4290 if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
4291 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
4292 else
4293 eof = 1;
4294 }
4295
4296 *nmap = n;
4297 return 0;
4298}
4299
4300
4301static int
4302xfs_bmapi_allocate(
4303 struct xfs_bmalloca *bma)
4304{
4305 struct xfs_mount *mp = bma->ip->i_mount;
4306 int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
4307 XFS_ATTR_FORK : XFS_DATA_FORK;
4308 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4309 int tmp_logflags = 0;
4310 int error;
4311
4312 ASSERT(bma->length > 0);
4313
4314 /*
4315 * For the wasdelay case, we could also just allocate the stuff asked
4316 * for in this bmap call but that wouldn't be as good.
4317 */
4318 if (bma->wasdel) {
4319 bma->length = (xfs_extlen_t)bma->got.br_blockcount;
4320 bma->offset = bma->got.br_startoff;
4321 if (bma->idx != NULLEXTNUM && bma->idx) {
4322 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
4323 &bma->prev);
4324 }
4325 } else {
4326 bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
4327 if (!bma->eof)
4328 bma->length = XFS_FILBLKS_MIN(bma->length,
4329 bma->got.br_startoff - bma->offset);
4330 }
4331
4332 /*
4333 * Indicate if this is the first user data in the file, or just any
4334 * user data.
4335 */
4336 if (!(bma->flags & XFS_BMAPI_METADATA)) {
4337 bma->userdata = (bma->offset == 0) ?
4338 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
4339 }
4340
4341 bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
4342
4343 /*
4344 * Only want to do the alignment at the eof if it is userdata and
4345 * allocation length is larger than a stripe unit.
4346 */
4347 if (mp->m_dalign && bma->length >= mp->m_dalign &&
4348 !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
4349 error = xfs_bmap_isaeof(bma, whichfork);
4350 if (error)
4351 return error;
4352 }
4353
4354 error = xfs_bmap_alloc(bma);
4355 if (error)
4356 return error;
4357
4358 if (bma->flist->xbf_low)
4359 bma->minleft = 0;
4360 if (bma->cur)
4361 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4362 if (bma->blkno == NULLFSBLOCK)
4363 return 0;
4364 if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
4365 bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
4366 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4367 bma->cur->bc_private.b.flist = bma->flist;
4368 }
4369 /*
4370 * Bump the number of extents we've allocated
4371 * in this call.
4372 */
4373 bma->nallocs++;
4374
4375 if (bma->cur)
4376 bma->cur->bc_private.b.flags =
4377 bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
4378
4379 bma->got.br_startoff = bma->offset;
4380 bma->got.br_startblock = bma->blkno;
4381 bma->got.br_blockcount = bma->length;
4382 bma->got.br_state = XFS_EXT_NORM;
4383
4384 /*
4385 * A wasdelay extent has been initialized, so shouldn't be flagged
4386 * as unwritten.
4387 */
4388 if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
4389 xfs_sb_version_hasextflgbit(&mp->m_sb))
4390 bma->got.br_state = XFS_EXT_UNWRITTEN;
4391
4392 if (bma->wasdel)
4393 error = xfs_bmap_add_extent_delay_real(bma);
4394 else
4395 error = xfs_bmap_add_extent_hole_real(bma, whichfork);
4396
4397 bma->logflags |= tmp_logflags;
4398 if (error)
4399 return error;
4400
4401 /*
4402 * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
4403 * or xfs_bmap_add_extent_hole_real might have merged it into one of
4404 * the neighbouring ones.
4405 */
4406 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
4407
4408 ASSERT(bma->got.br_startoff <= bma->offset);
4409 ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
4410 bma->offset + bma->length);
4411 ASSERT(bma->got.br_state == XFS_EXT_NORM ||
4412 bma->got.br_state == XFS_EXT_UNWRITTEN);
4413 return 0;
4414}
4415
4416STATIC int
4417xfs_bmapi_convert_unwritten(
4418 struct xfs_bmalloca *bma,
4419 struct xfs_bmbt_irec *mval,
4420 xfs_filblks_t len,
4421 int flags)
4422{
4423 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4424 XFS_ATTR_FORK : XFS_DATA_FORK;
4425 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4426 int tmp_logflags = 0;
4427 int error;
4428
4429 /* check if we need to do unwritten->real conversion */
4430 if (mval->br_state == XFS_EXT_UNWRITTEN &&
4431 (flags & XFS_BMAPI_PREALLOC))
4432 return 0;
4433
4434 /* check if we need to do real->unwritten conversion */
4435 if (mval->br_state == XFS_EXT_NORM &&
4436 (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
4437 (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
4438 return 0;
4439
4440 /*
4441 * Modify (by adding) the state flag, if writing.
4442 */
4443 ASSERT(mval->br_blockcount <= len);
4444 if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
4445 bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
4446 bma->ip, whichfork);
4447 bma->cur->bc_private.b.firstblock = *bma->firstblock;
4448 bma->cur->bc_private.b.flist = bma->flist;
4449 }
4450 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4451 ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
4452
4453 error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
4454 &bma->cur, mval, bma->firstblock, bma->flist,
4455 &tmp_logflags);
4456 bma->logflags |= tmp_logflags;
4457 if (error)
4458 return error;
4459
4460 /*
4461 * Update our extent pointer, given that
4462 * xfs_bmap_add_extent_unwritten_real might have merged it into one
4463 * of the neighbouring ones.
4464 */
4465 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
4466
4467 /*
4468 * We may have combined previously unwritten space with written space,
4469 * so generate another request.
4470 */
4471 if (mval->br_blockcount < len)
4472 return -EAGAIN;
4473 return 0;
4474}
4475
4476/*
4477 * Map file blocks to filesystem blocks, and allocate blocks or convert the
4478 * extent state if necessary. Detailed behaviour is controlled by the flags
4479 * parameter. Only allocates blocks from a single allocation group, to avoid
4480 * locking problems.
4481 *
4482 * The returned value in "firstblock" from the first call in a transaction
4483 * must be remembered and presented to subsequent calls in "firstblock".
4484 * An upper bound for the number of blocks to be allocated is supplied to
4485 * the first call in "total"; if no allocation group has that many free
4486 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
4487 */
4488int
4489xfs_bmapi_write(
4490 struct xfs_trans *tp, /* transaction pointer */
4491 struct xfs_inode *ip, /* incore inode */
4492 xfs_fileoff_t bno, /* starting file offs. mapped */
4493 xfs_filblks_t len, /* length to map in file */
4494 int flags, /* XFS_BMAPI_... */
4495 xfs_fsblock_t *firstblock, /* first allocated block
4496 controls a.g. for allocs */
4497 xfs_extlen_t total, /* total blocks needed */
4498 struct xfs_bmbt_irec *mval, /* output: map values */
4499 int *nmap, /* i/o: mval size/count */
4500 struct xfs_bmap_free *flist) /* i/o: list extents to free */
4501{
4502 struct xfs_mount *mp = ip->i_mount;
4503 struct xfs_ifork *ifp;
4504 struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
4505 xfs_fileoff_t end; /* end of mapped file region */
4506 int eof; /* after the end of extents */
4507 int error; /* error return */
4508 int n; /* current extent index */
4509 xfs_fileoff_t obno; /* old block number (offset) */
4510 int whichfork; /* data or attr fork */
4511 char inhole; /* current location is hole in file */
4512 char wasdelay; /* old extent was delayed */
4513
4514#ifdef DEBUG
4515 xfs_fileoff_t orig_bno; /* original block number value */
4516 int orig_flags; /* original flags arg value */
4517 xfs_filblks_t orig_len; /* original value of len arg */
4518 struct xfs_bmbt_irec *orig_mval; /* original value of mval */
4519 int orig_nmap; /* original value of *nmap */
4520
4521 orig_bno = bno;
4522 orig_len = len;
4523 orig_flags = flags;
4524 orig_mval = mval;
4525 orig_nmap = *nmap;
4526#endif
4527 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4528 XFS_ATTR_FORK : XFS_DATA_FORK;
4529
4530 ASSERT(*nmap >= 1);
4531 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4532 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4533 ASSERT(tp != NULL);
4534 ASSERT(len > 0);
4535 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
4536 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
4537
4538 if (unlikely(XFS_TEST_ERROR(
4539 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4540 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4541 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4542 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4543 return -EFSCORRUPTED;
4544 }
4545
4546 if (XFS_FORCED_SHUTDOWN(mp))
4547 return -EIO;
4548
4549 ifp = XFS_IFORK_PTR(ip, whichfork);
4550
4551 XFS_STATS_INC(xs_blk_mapw);
4552
4553 if (*firstblock == NULLFSBLOCK) {
4554 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4555 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
4556 else
4557 bma.minleft = 1;
4558 } else {
4559 bma.minleft = 0;
4560 }
4561
4562 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4563 error = xfs_iread_extents(tp, ip, whichfork);
4564 if (error)
4565 goto error0;
4566 }
4567
4568 xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
4569 &bma.prev);
4570 n = 0;
4571 end = bno + len;
4572 obno = bno;
4573
4574 bma.tp = tp;
4575 bma.ip = ip;
4576 bma.total = total;
4577 bma.userdata = 0;
4578 bma.flist = flist;
4579 bma.firstblock = firstblock;
4580
4581 while (bno < end && n < *nmap) {
4582 inhole = eof || bma.got.br_startoff > bno;
4583 wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
4584
4585 /*
4586 * First, deal with the hole before the allocated space
4587 * that we found, if any.
4588 */
4589 if (inhole || wasdelay) {
4590 bma.eof = eof;
4591 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4592 bma.wasdel = wasdelay;
4593 bma.offset = bno;
4594 bma.flags = flags;
4595
4596 /*
4597 * There's a 32/64 bit type mismatch between the
4598 * allocation length request (which can be 64 bits in
4599 * length) and the bma length request, which is
4600 * xfs_extlen_t and therefore 32 bits. Hence we have to
4601 * check for 32-bit overflows and handle them here.
4602 */
4603 if (len > (xfs_filblks_t)MAXEXTLEN)
4604 bma.length = MAXEXTLEN;
4605 else
4606 bma.length = len;
4607
4608 ASSERT(len > 0);
4609 ASSERT(bma.length > 0);
4610 error = xfs_bmapi_allocate(&bma);
4611 if (error)
4612 goto error0;
4613 if (bma.blkno == NULLFSBLOCK)
4614 break;
4615 }
4616
4617 /* Deal with the allocated space we found. */
4618 xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
4619 end, n, flags);
4620
4621 /* Execute unwritten extent conversion if necessary */
4622 error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
4623 if (error == -EAGAIN)
4624 continue;
4625 if (error)
4626 goto error0;
4627
4628 /* update the extent map to return */
4629 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4630
4631 /*
4632 * If we're done, stop now. Stop when we've allocated
4633 * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
4634 * the transaction may get too big.
4635 */
4636 if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
4637 break;
4638
4639 /* Else go on to the next record. */
4640 bma.prev = bma.got;
4641 if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
4642 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
4643 &bma.got);
4644 } else
4645 eof = 1;
4646 }
4647 *nmap = n;
4648
4649 /*
4650 * Transform from btree to extents, give it cur.
4651 */
4652 if (xfs_bmap_wants_extents(ip, whichfork)) {
4653 int tmp_logflags = 0;
4654
4655 ASSERT(bma.cur);
4656 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
4657 &tmp_logflags, whichfork);
4658 bma.logflags |= tmp_logflags;
4659 if (error)
4660 goto error0;
4661 }
4662
4663 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
4664 XFS_IFORK_NEXTENTS(ip, whichfork) >
4665 XFS_IFORK_MAXEXT(ip, whichfork));
4666 error = 0;
4667error0:
4668 /*
4669 * Log everything. Do this after conversion, there's no point in
4670 * logging the extent records if we've converted to btree format.
4671 */
4672 if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
4673 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
4674 bma.logflags &= ~xfs_ilog_fext(whichfork);
4675 else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
4676 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
4677 bma.logflags &= ~xfs_ilog_fbroot(whichfork);
4678 /*
4679 * Log whatever the flags say, even if error. Otherwise we might miss
4680 * detecting a case where the data is changed, there's an error,
4681 * and it's not logged so we don't shutdown when we should.
4682 */
4683 if (bma.logflags)
4684 xfs_trans_log_inode(tp, ip, bma.logflags);
4685
4686 if (bma.cur) {
4687 if (!error) {
4688 ASSERT(*firstblock == NULLFSBLOCK ||
4689 XFS_FSB_TO_AGNO(mp, *firstblock) ==
4690 XFS_FSB_TO_AGNO(mp,
4691 bma.cur->bc_private.b.firstblock) ||
4692 (flist->xbf_low &&
4693 XFS_FSB_TO_AGNO(mp, *firstblock) <
4694 XFS_FSB_TO_AGNO(mp,
4695 bma.cur->bc_private.b.firstblock)));
4696 *firstblock = bma.cur->bc_private.b.firstblock;
4697 }
4698 xfs_btree_del_cursor(bma.cur,
4699 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
4700 }
4701 if (!error)
4702 xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
4703 orig_nmap, *nmap);
4704 return error;
4705}
4706
4707/*
4708 * Called by xfs_bmapi to update file extent records and the btree
4709 * after removing space (or undoing a delayed allocation).
 *
 * The extent record at *idx must fully contain "del" (asserted below).
 * Depending on where del sits inside that record, the record is removed,
 * trimmed at its front, trimmed at its back, or split in two. *idx is
 * decremented when the record is removed and incremented when a new record
 * is inserted after it. *logflagsp always receives the inode logging flags
 * accumulated here, including on the error paths.
 *
 * Returns 0 or a negative errno; -ENOSPC is returned when the btree insert
 * needed for a split reports no space, after the records have been put back.
4710 */
4711STATIC int /* error */
4712xfs_bmap_del_extent(
4713 xfs_inode_t *ip, /* incore inode pointer */
4714 xfs_trans_t *tp, /* current transaction pointer */
4715 xfs_extnum_t *idx, /* extent number to update/delete */
4716 xfs_bmap_free_t *flist, /* list of extents to be freed */
4717 xfs_btree_cur_t *cur, /* if null, not a btree */
4718 xfs_bmbt_irec_t *del, /* data to remove from extents */
4719 int *logflagsp, /* inode logging flags */
4720 int whichfork) /* data or attr fork */
4721{
4722 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
4723 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
4724 xfs_fsblock_t del_endblock=0; /* first block past del */
4725 xfs_fileoff_t del_endoff; /* first offset past del */
4726 int delay; /* current block is delayed allocated */
4727 int do_fx; /* free extent at end of routine */
4728 xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
4729 int error; /* error return value */
4730 int flags; /* inode logging flags */
4731 xfs_bmbt_irec_t got; /* current extent entry */
4732 xfs_fileoff_t got_endoff; /* first offset past got */
4733 int i; /* temp state */
4734 xfs_ifork_t *ifp; /* inode fork pointer */
4735 xfs_mount_t *mp; /* mount structure */
4736 xfs_filblks_t nblks; /* quota/sb block count */
4737 xfs_bmbt_irec_t new; /* new record to be inserted */
4738 /* REFERENCED */
4739 uint qfield; /* quota field to update */
4740 xfs_filblks_t temp; /* for indirect length calculations */
4741 xfs_filblks_t temp2; /* for indirect length calculations */
4742 int state = 0;
4743
4744 XFS_STATS_INC(xs_del_exlist);
4745
4746 if (whichfork == XFS_ATTR_FORK)
4747 state |= BMAP_ATTRFORK;
4748
4749 mp = ip->i_mount;
4750 ifp = XFS_IFORK_PTR(ip, whichfork);
4751 ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
4752 (uint)sizeof(xfs_bmbt_rec_t)));
4753 ASSERT(del->br_blockcount > 0);
4754 ep = xfs_iext_get_ext(ifp, *idx);
4755 xfs_bmbt_get_all(ep, &got);
4756 ASSERT(got.br_startoff <= del->br_startoff);
4757 del_endoff = del->br_startoff + del->br_blockcount;
4758 got_endoff = got.br_startoff + got.br_blockcount;
4759 ASSERT(got_endoff >= del_endoff);
4760 delay = isnullstartblock(got.br_startblock);
4761 ASSERT(isnullstartblock(del->br_startblock) == delay);
4762 flags = 0;
4763 qfield = 0;
4764 error = 0;
4765 /*
4766 * If deleting a real allocation, must free up the disk space.
4767 */
4768 if (!delay) {
4769 flags = XFS_ILOG_CORE;
4770 /*
4771 * Realtime allocation. Free it and record di_nblocks update.
4772 */
4773 if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
4774 xfs_fsblock_t bno;
4775 xfs_filblks_t len;
4776
4777 ASSERT(do_mod(del->br_blockcount,
4778 mp->m_sb.sb_rextsize) == 0);
4779 ASSERT(do_mod(del->br_startblock,
4780 mp->m_sb.sb_rextsize) == 0);
 /* Convert the block start and length into units of sb_rextsize blocks. */
4781 bno = del->br_startblock;
4782 len = del->br_blockcount;
4783 do_div(bno, mp->m_sb.sb_rextsize);
4784 do_div(len, mp->m_sb.sb_rextsize);
4785 error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
4786 if (error)
4787 goto done;
4788 do_fx = 0;
4789 nblks = len * mp->m_sb.sb_rextsize;
4790 qfield = XFS_TRANS_DQ_RTBCOUNT;
4791 }
4792 /*
4793 * Ordinary allocation.
4794 */
4795 else {
4796 do_fx = 1;
4797 nblks = del->br_blockcount;
4798 qfield = XFS_TRANS_DQ_BCOUNT;
4799 }
4800 /*
4801 * Set up del_endblock and cur for later.
4802 */
4803 del_endblock = del->br_startblock + del->br_blockcount;
4804 if (cur) {
4805 if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
4806 got.br_startblock, got.br_blockcount,
4807 &i)))
4808 goto done;
4809 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
4810 }
4811 da_old = da_new = 0;
4812 } else {
 /*
  * Delayed allocation: there is no disk space to free. da_old is the
  * indirect-block count carried in the existing record's startblock.
  */
4813 da_old = startblockval(got.br_startblock);
4814 da_new = 0;
4815 nblks = 0;
4816 do_fx = 0;
4817 }
4818 /*
4819 * Set flag value to use in switch statement.
4820 * Left-contig is 2, right-contig is 1.
4821 */
4822 switch (((got.br_startoff == del->br_startoff) << 1) |
4823 (got_endoff == del_endoff)) {
4824 case 3:
4825 /*
4826 * Matches the whole extent. Delete the entry.
4827 */
 /* The last argument is the same value already held in "state". */
4828 xfs_iext_remove(ip, *idx, 1,
4829 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
4830 --*idx;
4831 if (delay)
4832 break;
4833
4834 XFS_IFORK_NEXT_SET(ip, whichfork,
4835 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
4836 flags |= XFS_ILOG_CORE;
4837 if (!cur) {
4838 flags |= xfs_ilog_fext(whichfork);
4839 break;
4840 }
4841 if ((error = xfs_btree_delete(cur, &i)))
4842 goto done;
4843 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
4844 break;
4845
4846 case 2:
4847 /*
4848 * Deleting the first part of the extent.
4849 */
4850 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
4851 xfs_bmbt_set_startoff(ep, del_endoff);
4852 temp = got.br_blockcount - del->br_blockcount;
4853 xfs_bmbt_set_blockcount(ep, temp);
4854 if (delay) {
 /* New indirect reservation: worst case for the remainder, capped at da_old. */
4855 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
4856 da_old);
4857 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
4858 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
4859 da_new = temp;
4860 break;
4861 }
4862 xfs_bmbt_set_startblock(ep, del_endblock);
4863 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
4864 if (!cur) {
4865 flags |= xfs_ilog_fext(whichfork);
4866 break;
4867 }
4868 if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
4869 got.br_blockcount - del->br_blockcount,
4870 got.br_state)))
4871 goto done;
4872 break;
4873
4874 case 1:
4875 /*
4876 * Deleting the last part of the extent.
4877 */
4878 temp = got.br_blockcount - del->br_blockcount;
4879 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
4880 xfs_bmbt_set_blockcount(ep, temp);
4881 if (delay) {
 /* New indirect reservation: worst case for the remainder, capped at da_old. */
4882 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
4883 da_old);
4884 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
4885 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
4886 da_new = temp;
4887 break;
4888 }
4889 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
4890 if (!cur) {
4891 flags |= xfs_ilog_fext(whichfork);
4892 break;
4893 }
4894 if ((error = xfs_bmbt_update(cur, got.br_startoff,
4895 got.br_startblock,
4896 got.br_blockcount - del->br_blockcount,
4897 got.br_state)))
4898 goto done;
4899 break;
4900
4901 case 0:
4902 /*
4903 * Deleting the middle of the extent.
 * The existing record keeps the part before del ("temp" blocks) and
 * "new" describes the part after it ("temp2" blocks).
4904 */
4905 temp = del->br_startoff - got.br_startoff;
4906 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
4907 xfs_bmbt_set_blockcount(ep, temp);
4908 new.br_startoff = del_endoff;
4909 temp2 = got_endoff - del_endoff;
4910 new.br_blockcount = temp2;
4911 new.br_state = got.br_state;
4912 if (!delay) {
4913 new.br_startblock = del_endblock;
4914 flags |= XFS_ILOG_CORE;
4915 if (cur) {
4916 if ((error = xfs_bmbt_update(cur,
4917 got.br_startoff,
4918 got.br_startblock, temp,
4919 got.br_state)))
4920 goto done;
4921 if ((error = xfs_btree_increment(cur, 0, &i)))
4922 goto done;
4923 cur->bc_rec.b = new;
4924 error = xfs_btree_insert(cur, &i);
4925 if (error && error != -ENOSPC)
4926 goto done;
4927 /*
4928 * If we get no-space back from btree insert,
4929 * it tried a split, and we have a zero
4930 * block reservation.
4931 * Fix up our state and return the error.
4932 */
4933 if (error == -ENOSPC) {
4934 /*
4935 * Reset the cursor, don't trust
4936 * it after any insert operation.
4937 */
4938 if ((error = xfs_bmbt_lookup_eq(cur,
4939 got.br_startoff,
4940 got.br_startblock,
4941 temp, &i)))
4942 goto done;
4943 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
4944 /*
4945 * Update the btree record back
4946 * to the original value.
4947 */
4948 if ((error = xfs_bmbt_update(cur,
4949 got.br_startoff,
4950 got.br_startblock,
4951 got.br_blockcount,
4952 got.br_state)))
4953 goto done;
4954 /*
4955 * Reset the extent record back
4956 * to the original value.
4957 */
4958 xfs_bmbt_set_blockcount(ep,
4959 got.br_blockcount);
 /* Everything was put back, so report no logging flags. */
4960 flags = 0;
4961 error = -ENOSPC;
4962 goto done;
4963 }
4964 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
4965 } else
4966 flags |= xfs_ilog_fext(whichfork);
4967 XFS_IFORK_NEXT_SET(ip, whichfork,
4968 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
4969 } else {
4970 ASSERT(whichfork == XFS_DATA_FORK);
4971 temp = xfs_bmap_worst_indlen(ip, temp);
4972 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
4973 temp2 = xfs_bmap_worst_indlen(ip, temp2);
4974 new.br_startblock = nullstartblock((int)temp2);
4975 da_new = temp + temp2;
 /*
  * Shave the two reservations alternately until their sum no
  * longer exceeds the old reservation.
  */
4976 while (da_new > da_old) {
4977 if (temp) {
4978 temp--;
4979 da_new--;
4980 xfs_bmbt_set_startblock(ep,
4981 nullstartblock((int)temp));
4982 }
4983 if (da_new == da_old)
4984 break;
4985 if (temp2) {
4986 temp2--;
4987 da_new--;
4988 new.br_startblock =
4989 nullstartblock((int)temp2);
4990 }
4991 }
4992 }
4993 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
4994 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
4995 ++*idx;
4996 break;
4997 }
4998 /*
4999 * If we need to, add to list of extents to delete.
5000 */
5001 if (do_fx)
5002 xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
5003 mp);
5004 /*
5005 * Adjust inode # blocks in the file.
5006 */
5007 if (nblks)
5008 ip->i_d.di_nblocks -= nblks;
5009 /*
5010 * Adjust quota data.
5011 */
5012 if (qfield)
5013 xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
5014
5015 /*
5016 * Account for change in delayed indirect blocks.
5017 * Nothing to do for disk quota accounting here.
5018 */
5019 ASSERT(da_old >= da_new);
5020 if (da_old > da_new) {
5021 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5022 (int64_t)(da_old - da_new), 0);
5023 }
5024done:
5025 *logflagsp = flags;
5026 return error;
5027}
5028
5029/*
5030 * Unmap (remove) blocks from a file.
5031 * If nexts is nonzero then the number of extents to remove is limited to
5032 * that value. *done is set nonzero once the whole block range has been
5033 * processed, and to zero if the loop stopped before reaching its start.
 *
 * The range is walked backwards, from its last block (bno + len - 1) down
 * towards bno. *done is also set to 1 when the fork has no extents at all.
 * It is left untouched when an error is hit inside the unmap loop.
 *
 * Returns 0 on success, -EFSCORRUPTED if the fork is in neither extents nor
 * btree format, -EIO if the filesystem has been forcibly shut down, -ENOSPC
 * for the no-reservation middle-of-extent case described in the loop, or the
 * error reported by one of the helpers called below.
5034 */
5035int /* error */
5036xfs_bunmapi(
5037 xfs_trans_t *tp, /* transaction pointer */
5038 struct xfs_inode *ip, /* incore inode */
5039 xfs_fileoff_t bno, /* starting offset to unmap */
5040 xfs_filblks_t len, /* length to unmap in file */
5041 int flags, /* misc flags */
5042 xfs_extnum_t nexts, /* number of extents max */
5043 xfs_fsblock_t *firstblock, /* first allocated block
5044 controls a.g. for allocs */
5045 xfs_bmap_free_t *flist, /* i/o: list extents to free */
5046 int *done) /* set when unmapping is complete */
5047{
5048 xfs_btree_cur_t *cur; /* bmap btree cursor */
5049 xfs_bmbt_irec_t del; /* extent being deleted */
5050 int eof; /* is deleting at eof */
5051 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
5052 int error; /* error return value */
5053 xfs_extnum_t extno; /* extent number in list */
5054 xfs_bmbt_irec_t got; /* current extent record */
5055 xfs_ifork_t *ifp; /* inode fork pointer */
5056 int isrt; /* freeing in rt area */
5057 xfs_extnum_t lastx; /* last extent index used */
5058 int logflags; /* transaction logging flags */
5059 xfs_extlen_t mod; /* rt extent offset */
5060 xfs_mount_t *mp; /* mount structure */
5061 xfs_extnum_t nextents; /* number of file extents */
5062 xfs_bmbt_irec_t prev; /* previous extent record */
5063 xfs_fileoff_t start; /* first file offset deleted */
5064 int tmp_logflags; /* partial logging flags */
5065 int wasdel; /* was a delayed alloc extent */
5066 int whichfork; /* data or attribute fork */
5067 xfs_fsblock_t sum;
5068
5069 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
5070
5071 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
5072 XFS_ATTR_FORK : XFS_DATA_FORK;
5073 ifp = XFS_IFORK_PTR(ip, whichfork);
 /* Only extents-format and btree-format forks can be unmapped here. */
5074 if (unlikely(
5075 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5076 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
5077 XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
5078 ip->i_mount);
5079 return -EFSCORRUPTED;
5080 }
5081 mp = ip->i_mount;
5082 if (XFS_FORCED_SHUTDOWN(mp))
5083 return -EIO;
5084
5085 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5086 ASSERT(len > 0);
5087 ASSERT(nexts >= 0);
5088
 /* Read the extent list into the incore fork if that has not happened yet. */
5089 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5090 (error = xfs_iread_extents(tp, ip, whichfork)))
5091 return error;
5092 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5093 if (nextents == 0) {
5094 *done = 1;
5095 return 0;
5096 }
5097 XFS_STATS_INC(xs_blk_unmap);
5098 isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
 /* From here on bno is the last block still to be unmapped; it moves down. */
5099 start = bno;
5100 bno = start + len - 1;
5101 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
5102 &prev);
5103
5104 /*
5105 * Check to see if the given block number is past the end of the
5106 * file, back up to the last block if so...
5107 */
5108 if (eof) {
5109 ep = xfs_iext_get_ext(ifp, --lastx);
5110 xfs_bmbt_get_all(ep, &got);
5111 bno = got.br_startoff + got.br_blockcount - 1;
5112 }
5113 logflags = 0;
5114 if (ifp->if_flags & XFS_IFBROOT) {
5115 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
5116 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5117 cur->bc_private.b.firstblock = *firstblock;
5118 cur->bc_private.b.flist = flist;
5119 cur->bc_private.b.flags = 0;
5120 } else
5121 cur = NULL;
5122
5123 if (isrt) {
5124 /*
5125 * Synchronize by locking the bitmap inode.
5126 */
5127 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
5128 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
5129 }
5130
5131 extno = 0;
 /*
  * bno == (xfs_fileoff_t)-1 is what "bno = del.br_startoff - 1" below yields
  * once the extent starting at offset 0 has been handled.
  */
5132 while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
5133 (nexts == 0 || extno < nexts)) {
5134 /*
5135 * Is the found extent after a hole in which bno lives?
5136 * Just back up to the previous extent, if so.
5137 */
5138 if (got.br_startoff > bno) {
5139 if (--lastx < 0)
5140 break;
5141 ep = xfs_iext_get_ext(ifp, lastx);
5142 xfs_bmbt_get_all(ep, &got);
5143 }
5144 /*
5145 * Is the last block of this extent before the range
5146 * we're supposed to delete? If so, we're done.
5147 */
5148 bno = XFS_FILEOFF_MIN(bno,
5149 got.br_startoff + got.br_blockcount - 1);
5150 if (bno < start)
5151 break;
5152 /*
5153 * Then deal with the (possibly delayed) allocated space
5154 * we found.
5155 */
5156 ASSERT(ep != NULL);
 /* Clip "del" to the part of "got" that lies within [start, bno]. */
5157 del = got;
5158 wasdel = isnullstartblock(del.br_startblock);
5159 if (got.br_startoff < start) {
5160 del.br_startoff = start;
5161 del.br_blockcount -= start - got.br_startoff;
5162 if (!wasdel)
5163 del.br_startblock += start - got.br_startoff;
5164 }
5165 if (del.br_startoff + del.br_blockcount > bno + 1)
5166 del.br_blockcount = bno + 1 - del.br_startoff;
5167 sum = del.br_startblock + del.br_blockcount;
5168 if (isrt &&
5169 (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
5170 /*
5171 * Realtime extent not lined up at the end.
5172 * The extent could have been split into written
5173 * and unwritten pieces, or we could just be
5174 * unmapping part of it. But we can't really
5175 * get rid of part of a realtime extent.
5176 */
5177 if (del.br_state == XFS_EXT_UNWRITTEN ||
5178 !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
5179 /*
5180 * This piece is unwritten, or we're not
5181 * using unwritten extents. Skip over it.
5182 */
5183 ASSERT(bno >= mod);
5184 bno -= mod > del.br_blockcount ?
5185 del.br_blockcount : mod;
5186 if (bno < got.br_startoff) {
5187 if (--lastx >= 0)
5188 xfs_bmbt_get_all(xfs_iext_get_ext(
5189 ifp, lastx), &got);
5190 }
5191 continue;
5192 }
5193 /*
5194 * It's written, turn it unwritten.
5195 * This is better than zeroing it.
5196 */
5197 ASSERT(del.br_state == XFS_EXT_NORM);
5198 ASSERT(xfs_trans_get_block_res(tp) > 0);
5199 /*
5200 * If this spans a realtime extent boundary,
5201 * chop it back to the start of the one we end at.
5202 */
5203 if (del.br_blockcount > mod) {
5204 del.br_startoff += del.br_blockcount - mod;
5205 del.br_startblock += del.br_blockcount - mod;
5206 del.br_blockcount = mod;
5207 }
5208 del.br_state = XFS_EXT_UNWRITTEN;
5209 error = xfs_bmap_add_extent_unwritten_real(tp, ip,
5210 &lastx, &cur, &del, firstblock, flist,
5211 &logflags);
5212 if (error)
5213 goto error0;
5214 goto nodelete;
5215 }
5216 if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
5217 /*
5218 * Realtime extent is lined up at the end but not
5219 * at the front. We'll get rid of full extents if
5220 * we can.
5221 */
 /* mod becomes the distance from del's start to the next rextsize boundary. */
5222 mod = mp->m_sb.sb_rextsize - mod;
5223 if (del.br_blockcount > mod) {
5224 del.br_blockcount -= mod;
5225 del.br_startoff += mod;
5226 del.br_startblock += mod;
5227 } else if ((del.br_startoff == start &&
5228 (del.br_state == XFS_EXT_UNWRITTEN ||
5229 xfs_trans_get_block_res(tp) == 0)) ||
5230 !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
5231 /*
5232 * Can't make it unwritten. There isn't
5233 * a full extent here so just skip it.
5234 */
5235 ASSERT(bno >= del.br_blockcount);
5236 bno -= del.br_blockcount;
5237 if (got.br_startoff > bno) {
5238 if (--lastx >= 0) {
5239 ep = xfs_iext_get_ext(ifp,
5240 lastx);
5241 xfs_bmbt_get_all(ep, &got);
5242 }
5243 }
5244 continue;
5245 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
5246 /*
5247 * This one is already unwritten.
5248 * It must have a written left neighbor.
5249 * Unwrite the killed part of that one and
5250 * try again.
5251 */
5252 ASSERT(lastx > 0);
5253 xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
5254 lastx - 1), &prev);
5255 ASSERT(prev.br_state == XFS_EXT_NORM);
5256 ASSERT(!isnullstartblock(prev.br_startblock));
5257 ASSERT(del.br_startblock ==
5258 prev.br_startblock + prev.br_blockcount);
5259 if (prev.br_startoff < start) {
5260 mod = start - prev.br_startoff;
5261 prev.br_blockcount -= mod;
5262 prev.br_startblock += mod;
5263 prev.br_startoff = start;
5264 }
5265 prev.br_state = XFS_EXT_UNWRITTEN;
5266 lastx--;
5267 error = xfs_bmap_add_extent_unwritten_real(tp,
5268 ip, &lastx, &cur, &prev,
5269 firstblock, flist, &logflags);
5270 if (error)
5271 goto error0;
5272 goto nodelete;
5273 } else {
5274 ASSERT(del.br_state == XFS_EXT_NORM);
5275 del.br_state = XFS_EXT_UNWRITTEN;
5276 error = xfs_bmap_add_extent_unwritten_real(tp,
5277 ip, &lastx, &cur, &del,
5278 firstblock, flist, &logflags);
5279 if (error)
5280 goto error0;
5281 goto nodelete;
5282 }
5283 }
5284 if (wasdel) {
5285 ASSERT(startblockval(del.br_startblock) > 0);
5286 /* Update realtime/data freespace, unreserve quota */
5287 if (isrt) {
5288 xfs_filblks_t rtexts;
5289
5290 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5291 do_div(rtexts, mp->m_sb.sb_rextsize);
5292 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
5293 (int64_t)rtexts, 0);
5294 (void)xfs_trans_reserve_quota_nblks(NULL,
5295 ip, -((long)del.br_blockcount), 0,
5296 XFS_QMOPT_RES_RTBLKS);
5297 } else {
5298 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5299 (int64_t)del.br_blockcount, 0);
5300 (void)xfs_trans_reserve_quota_nblks(NULL,
5301 ip, -((long)del.br_blockcount), 0,
5302 XFS_QMOPT_RES_REGBLKS);
5303 }
5304 ip->i_delayed_blks -= del.br_blockcount;
5305 if (cur)
5306 cur->bc_private.b.flags |=
5307 XFS_BTCUR_BPRV_WASDEL;
5308 } else if (cur)
5309 cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
5310 /*
5311 * If it's the case where the directory code is running
5312 * with no block reservation, and the deleted block is in
5313 * the middle of its extent, and the resulting insert
5314 * of an extent would cause transformation to btree format,
5315 * then reject it. The calling code will then swap
5316 * blocks around instead.
5317 * We have to do this now, rather than waiting for the
5318 * conversion to btree format, since the transaction
5319 * will be dirty.
5320 */
5321 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5322 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5323 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5324 XFS_IFORK_MAXEXT(ip, whichfork) &&
5325 del.br_startoff > got.br_startoff &&
5326 del.br_startoff + del.br_blockcount <
5327 got.br_startoff + got.br_blockcount) {
5328 error = -ENOSPC;
5329 goto error0;
5330 }
5331 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5332 &tmp_logflags, whichfork);
5333 logflags |= tmp_logflags;
5334 if (error)
5335 goto error0;
 /* Everything from del.br_startoff up has been dealt with; step below it. */
5336 bno = del.br_startoff - 1;
5337nodelete:
5338 /*
5339 * If not done go on to the next (previous) record.
5340 */
5341 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5342 if (lastx >= 0) {
5343 ep = xfs_iext_get_ext(ifp, lastx);
5344 if (xfs_bmbt_get_startoff(ep) > bno) {
5345 if (--lastx >= 0)
5346 ep = xfs_iext_get_ext(ifp,
5347 lastx);
5348 }
5349 xfs_bmbt_get_all(ep, &got);
5350 }
5351 extno++;
5352 }
5353 }
 /* Complete when the range or the extent list has been exhausted. */
5354 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5355
5356 /*
5357 * Convert to a btree if necessary.
5358 */
5359 if (xfs_bmap_needs_btree(ip, whichfork)) {
5360 ASSERT(cur == NULL);
5361 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
5362 &cur, 0, &tmp_logflags, whichfork);
5363 logflags |= tmp_logflags;
5364 if (error)
5365 goto error0;
5366 }
5367 /*
5368 * transform from btree to extents, give it cur
5369 */
5370 else if (xfs_bmap_wants_extents(ip, whichfork)) {
5371 ASSERT(cur != NULL);
5372 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5373 whichfork);
5374 logflags |= tmp_logflags;
5375 if (error)
5376 goto error0;
5377 }
5378 /*
5379 * transform from extents to local?
5380 */
5381 error = 0;
5382error0:
5383 /*
5384 * Log everything. Do this after conversion, there's no point in
5385 * logging the extent records if we've converted to btree format.
5386 */
5387 if ((logflags & xfs_ilog_fext(whichfork)) &&
5388 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5389 logflags &= ~xfs_ilog_fext(whichfork);
5390 else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
5391 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5392 logflags &= ~xfs_ilog_fbroot(whichfork);
5393 /*
5394 * Log inode even in the error case, if the transaction
5395 * is dirty we'll need to shut down the filesystem.
5396 */
5397 if (logflags)
5398 xfs_trans_log_inode(tp, ip, logflags);
 /*
  * Hand the cursor's firstblock back to the caller on success, then release
  * the cursor, flagging it as errored when we are failing.
  */
5399 if (cur) {
5400 if (!error) {
5401 *firstblock = cur->bc_private.b.firstblock;
5402 cur->bc_private.b.allocated = 0;
5403 }
5404 xfs_btree_del_cursor(cur,
5405 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5406 }
5407 return error;
5408}
5409
5410/*
5411 * Shift extent records to the left to cover a hole.
5412 *
5413 * The maximum number of extents to be shifted in a single operation
5414 * is @num_exts, and @current_ext keeps track of the current extent
5415 * index we have shifted. @offset_shift_fsb is the length by which each
5416 * extent is shifted. If there is no hole to shift the extents
5417 * into, this will be considered invalid operation and we abort immediately.
 *
 * Only the data fork is operated on. *done is set to 1 when there is nothing
 * (more) to shift; it is never cleared here, so the caller presumably
 * initialises it to 0 - confirm against the callers.
 *
 * Returns 0 on success, -EINVAL when the hole is too small for the shift,
 * -EFSCORRUPTED if the fork is in neither extents nor btree format, -EIO if
 * the filesystem has been forcibly shut down, or the error reported by one
 * of the helpers called below.
5418 */
5419int
5420xfs_bmap_shift_extents(
5421 struct xfs_trans *tp,
5422 struct xfs_inode *ip,
5423 int *done,
5424 xfs_fileoff_t start_fsb,
5425 xfs_fileoff_t offset_shift_fsb,
5426 xfs_extnum_t *current_ext,
5427 xfs_fsblock_t *firstblock,
5428 struct xfs_bmap_free *flist,
5429 int num_exts)
5430{
5431 struct xfs_btree_cur *cur;
5432 struct xfs_bmbt_rec_host *gotp;
5433 struct xfs_bmbt_irec got;
5434 struct xfs_bmbt_irec left;
5435 struct xfs_mount *mp = ip->i_mount;
5436 struct xfs_ifork *ifp;
5437 xfs_extnum_t nexts = 0;
5438 xfs_fileoff_t startoff;
5439 int error = 0;
5440 int i;
5441 int whichfork = XFS_DATA_FORK;
5442 int logflags;
5443 xfs_filblks_t blockcount = 0;
5444 int total_extents;
5445
5446 if (unlikely(XFS_TEST_ERROR(
5447 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5448 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
5449 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5450 XFS_ERROR_REPORT("xfs_bmap_shift_extents",
5451 XFS_ERRLEVEL_LOW, mp);
5452 return -EFSCORRUPTED;
5453 }
5454
5455 if (XFS_FORCED_SHUTDOWN(mp))
5456 return -EIO;
5457
5458 ASSERT(current_ext != NULL);
5459
5460 ifp = XFS_IFORK_PTR(ip, whichfork);
5461 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
5462 /* Read in all the extents */
5463 error = xfs_iread_extents(tp, ip, whichfork);
5464 if (error)
5465 return error;
5466 }
5467
5468 /*
5469 * If *current_ext is 0, we would need to lookup the extent
5470 * from where we would start shifting and store it in gotp.
5471 */
5472 if (!*current_ext) {
5473 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
5474 /*
5475 * gotp can be null in 2 cases: 1) if there are no extents
5476 * or 2) start_fsb lies in a hole beyond which there are
5477 * no extents. Either way, we are done.
5478 */
5479 if (!gotp) {
5480 *done = 1;
5481 return 0;
5482 }
5483 }
5484
5485 /* We are going to change core inode */
5486 logflags = XFS_ILOG_CORE;
5487 if (ifp->if_flags & XFS_IFBROOT) {
5488 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5489 cur->bc_private.b.firstblock = *firstblock;
5490 cur->bc_private.b.flist = flist;
5491 cur->bc_private.b.flags = 0;
5492 } else {
5493 cur = NULL;
5494 logflags |= XFS_ILOG_DEXT;
5495 }
5496
5497 /*
5498 * There may be delalloc extents in the data fork before the range we
5499 * are collapsing out, so we cannot
5500 * use the count of real extents here. Instead we have to calculate it
5501 * from the incore fork.
5502 */
5503 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5504 while (nexts++ < num_exts && *current_ext < total_extents) {
5505
5506 gotp = xfs_iext_get_ext(ifp, *current_ext);
5507 xfs_bmbt_get_all(gotp, &got);
 /*
  * NOTE(review): the signedness of xfs_fileoff_t is not visible here.
  * If it is unsigned, this subtraction wraps when offset_shift_fsb
  * exceeds got.br_startoff, and only the first-extent branch below
  * checks for that - confirm callers rule it out for later extents.
  */
5508 startoff = got.br_startoff - offset_shift_fsb;
5509
5510 /*
5511 * Before shifting extent into hole, make sure that the hole
5512 * is large enough to accommodate the shift.
5513 */
5514 if (*current_ext) {
5515 xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
5516 *current_ext - 1), &left);
5517
5518 if (startoff < left.br_startoff + left.br_blockcount)
5519 error = -EINVAL;
5520 } else if (offset_shift_fsb > got.br_startoff) {
5521 /*
5522 * When first extent is shifted, offset_shift_fsb
5523 * must not exceed the starting offset of
5524 * the first extent.
5525 */
5526 error = -EINVAL;
5527 }
5528
5529 if (error)
5530 goto del_cursor;
5531
5532 if (cur) {
5533 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5534 got.br_startblock,
5535 got.br_blockcount,
5536 &i);
5537 if (error)
5538 goto del_cursor;
5539 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5540 }
5541
5542 /* Check if we can merge 2 adjacent extents */
 /* "left" is only read when *current_ext is nonzero, i.e. when it was filled in above. */
5543 if (*current_ext &&
5544 left.br_startoff + left.br_blockcount == startoff &&
5545 left.br_startblock + left.br_blockcount ==
5546 got.br_startblock &&
5547 left.br_state == got.br_state &&
5548 left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
5549 blockcount = left.br_blockcount +
5550 got.br_blockcount;
5551 xfs_iext_remove(ip, *current_ext, 1, 0);
5552 if (cur) {
5553 error = xfs_btree_delete(cur, &i);
5554 if (error)
5555 goto del_cursor;
5556 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5557 }
5558 XFS_IFORK_NEXT_SET(ip, whichfork,
5559 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 /* Step back to the left neighbour; it absorbs the removed extent. */
5560 gotp = xfs_iext_get_ext(ifp, --*current_ext);
5561 xfs_bmbt_get_all(gotp, &got);
5562
5563 /* Make cursor point to the extent we will update */
5564 if (cur) {
5565 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5566 got.br_startblock,
5567 got.br_blockcount,
5568 &i);
5569 if (error)
5570 goto del_cursor;
5571 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5572 }
5573
5574 xfs_bmbt_set_blockcount(gotp, blockcount);
5575 got.br_blockcount = blockcount;
5576 } else {
5577 /* We have to update the startoff */
5578 xfs_bmbt_set_startoff(gotp, startoff);
5579 got.br_startoff = startoff;
5580 }
5581
5582 if (cur) {
5583 error = xfs_bmbt_update(cur, got.br_startoff,
5584 got.br_startblock,
5585 got.br_blockcount,
5586 got.br_state);
5587 if (error)
5588 goto del_cursor;
5589 }
5590
5591 (*current_ext)++;
 /* A merge may have removed a record, so recompute the count. */
5592 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5593 }
5594
5595 /* Check if we are done */
5596 if (*current_ext == total_extents)
5597 *done = 1;
5598
5599del_cursor:
5600 if (cur)
5601 xfs_btree_del_cursor(cur,
5602 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5603
 /*
  * NOTE(review): the inode is logged on every path that reaches this label,
  * including the -EINVAL exits taken before any record was modified -
  * confirm that this is intended.
  */
5604 xfs_trans_log_inode(tp, ip, logflags);
5605 return error;
5606}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
new file mode 100644
index 000000000000..b879ca56a64c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -0,0 +1,186 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BMAP_H__
19#define __XFS_BMAP_H__
20
21struct getbmap;
22struct xfs_bmbt_irec;
23struct xfs_ifork;
24struct xfs_inode;
25struct xfs_mount;
26struct xfs_trans;
27
28extern kmem_zone_t *xfs_bmap_free_item_zone;
29
30/*
31 * List of extents to be free "later".
32 * The list is kept sorted on xbf_startblock.
33 */
34typedef struct xfs_bmap_free_item
35{
36 xfs_fsblock_t xbfi_startblock;/* starting fs block number */
37 xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
38 struct xfs_bmap_free_item *xbfi_next; /* link to next entry */
39} xfs_bmap_free_item_t;
40
41/*
42 * Header for free extent list.
43 *
44 * xbf_low is used by the allocator to activate the lowspace algorithm -
45 * when free space is running low the extent allocator may choose to
46 * allocate an extent from an AG without leaving sufficient space for
47 * a btree split when inserting the new extent. In this case the allocator
48 * will enable the lowspace algorithm which is supposed to allow further
49 * allocations (such as btree splits and newroots) to allocate from
50 * sequential AGs. In order to avoid locking AGs out of order the lowspace
51 * algorithm will start searching for free space from AG 0. If the correct
52 * transaction reservations have been made then this algorithm will eventually
53 * find all the space it needs.
54 */
55typedef struct xfs_bmap_free
56{
57 xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
58 int xbf_count; /* count of items on list */
59 int xbf_low; /* alloc in low mode */
60} xfs_bmap_free_t;
61
#define	XFS_BMAP_MAX_NMAP	4

/*
 * Flag bits accepted by the xfs_bmapi_* calls.
 */
#define XFS_BMAPI_ENTIRE	0x001	/* return entire extent, not trimmed */
#define XFS_BMAPI_METADATA	0x002	/* mapping metadata not user data */
#define XFS_BMAPI_ATTRFORK	0x004	/* use attribute fork not data */
#define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
#define XFS_BMAPI_IGSTATE	0x010	/* ignore state - combine contig. space */
#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
/*
 * Unwritten extent conversion: needs write cache flushing and no additional
 * allocation alignments.  Together with XFS_BMAPI_PREALLOC it converts from
 * written to unwritten; on its own it converts from unwritten to written.
 */
#define XFS_BMAPI_CONVERT	0x040

/* Flag value / name pairs for the XFS_BMAPI_* bits above. */
#define XFS_BMAPI_FLAGS \
	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
	{ XFS_BMAPI_METADATA,	"METADATA" }, \
	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
	{ XFS_BMAPI_CONVERT,	"CONVERT" }
89
90
91static inline int xfs_bmapi_aflag(int w)
92{
93 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
94}
95
/*
 * Reserved values that may appear in the br_startblock field of an
 * xfs_bmbt_irec_t instead of a real block number.
 */
#define	DELAYSTARTBLOCK		((xfs_fsblock_t)-1LL)
#define	HOLESTARTBLOCK		((xfs_fsblock_t)-2LL)
101
102static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
103{
104 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
105 (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
106}
107
/*
 * State bits used by the xfs_bmap_add_extent* routines.
 */
#define BMAP_LEFT_CONTIG	(1 << 0)
#define BMAP_RIGHT_CONTIG	(1 << 1)
#define BMAP_LEFT_FILLING	(1 << 2)
#define BMAP_RIGHT_FILLING	(1 << 3)
#define BMAP_LEFT_DELAY		(1 << 4)
#define BMAP_RIGHT_DELAY	(1 << 5)
#define BMAP_LEFT_VALID		(1 << 6)
#define BMAP_RIGHT_VALID	(1 << 7)
#define BMAP_ATTRFORK		(1 << 8)

/* Flag value / short name pairs for a subset of the BMAP_* bits above. */
#define XFS_BMAP_EXT_FLAGS \
	{ BMAP_LEFT_CONTIG,	"LC" }, \
	{ BMAP_RIGHT_CONTIG,	"RC" }, \
	{ BMAP_LEFT_FILLING,	"LF" }, \
	{ BMAP_RIGHT_FILLING,	"RF" }, \
	{ BMAP_ATTRFORK,	"ATTR" }
127
128
/*
 * Number of extents that are shifted in a single write transaction.
 * Shifting one extent could require two btree splits: an extent move on
 * the first and an extent merge on the second.  It is therefore proper
 * to shift only one extent per write transaction.
 */
#define XFS_BMAP_MAX_SHIFT_EXTENTS	1
137
/*
 * Extent list tracing hook.  On DEBUG builds the macro forwards to
 * xfs_bmap_trace_exlist() with the caller's instruction pointer; on all
 * other builds it expands to nothing.
 */
#ifdef DEBUG
void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
		int whichfork, unsigned long caller_ip);
#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
	xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
#else
#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
#endif
146
147int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
148void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
149void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
150 struct xfs_bmap_free *flist, struct xfs_mount *mp);
151void xfs_bmap_cancel(struct xfs_bmap_free *flist);
152void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
153int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
154 xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
155int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
156 xfs_fileoff_t *last_block, int whichfork);
157int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
158 int whichfork);
159int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
160int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
161 int whichfork);
162int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
163 xfs_filblks_t len, struct xfs_bmbt_irec *mval,
164 int *nmap, int flags);
165int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
166 xfs_filblks_t len, struct xfs_bmbt_irec *mval,
167 int *nmap, int flags);
168int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
169 xfs_fileoff_t bno, xfs_filblks_t len, int flags,
170 xfs_fsblock_t *firstblock, xfs_extlen_t total,
171 struct xfs_bmbt_irec *mval, int *nmap,
172 struct xfs_bmap_free *flist);
173int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
174 xfs_fileoff_t bno, xfs_filblks_t len, int flags,
175 xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
176 struct xfs_bmap_free *flist, int *done);
177int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
178 xfs_extnum_t num);
179uint xfs_default_attroffset(struct xfs_inode *ip);
180int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
181 int *done, xfs_fileoff_t start_fsb,
182 xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
183 xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
184 int num_exts);
185
186#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644
index 000000000000..a388de4ceaa1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -0,0 +1,967 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_inode.h"
29#include "xfs_trans.h"
30#include "xfs_inode_item.h"
31#include "xfs_alloc.h"
32#include "xfs_btree.h"
33#include "xfs_bmap_btree.h"
34#include "xfs_bmap.h"
35#include "xfs_error.h"
36#include "xfs_quota.h"
37#include "xfs_trace.h"
38#include "xfs_cksum.h"
39#include "xfs_dinode.h"
40
41/*
42 * Determine the extent state.
43 */
44/* ARGSUSED */
45STATIC xfs_exntst_t
46xfs_extent_state(
47 xfs_filblks_t blks,
48 int extent_flag)
49{
50 if (extent_flag) {
51 ASSERT(blks != 0); /* saved for DMIG */
52 return XFS_EXT_UNWRITTEN;
53 }
54 return XFS_EXT_NORM;
55}
56
57/*
58 * Convert on-disk form of btree root to in-memory form.
59 */
60void
61xfs_bmdr_to_bmbt(
62 struct xfs_inode *ip,
63 xfs_bmdr_block_t *dblock,
64 int dblocklen,
65 struct xfs_btree_block *rblock,
66 int rblocklen)
67{
68 struct xfs_mount *mp = ip->i_mount;
69 int dmxr;
70 xfs_bmbt_key_t *fkp;
71 __be64 *fpp;
72 xfs_bmbt_key_t *tkp;
73 __be64 *tpp;
74
75 if (xfs_sb_version_hascrc(&mp->m_sb))
76 xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
77 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
78 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
79 else
80 xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
81 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
82 XFS_BTREE_LONG_PTRS);
83
84 rblock->bb_level = dblock->bb_level;
85 ASSERT(be16_to_cpu(rblock->bb_level) > 0);
86 rblock->bb_numrecs = dblock->bb_numrecs;
87 dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
88 fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
89 tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
90 fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
91 tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
92 dmxr = be16_to_cpu(dblock->bb_numrecs);
93 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
94 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
95}
96
97/*
98 * Convert a compressed bmap extent record to an uncompressed form.
99 * This code must be in sync with the routines xfs_bmbt_get_startoff,
100 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
101 */
102STATIC void
103__xfs_bmbt_get_all(
104 __uint64_t l0,
105 __uint64_t l1,
106 xfs_bmbt_irec_t *s)
107{
108 int ext_flag;
109 xfs_exntst_t st;
110
111 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
112 s->br_startoff = ((xfs_fileoff_t)l0 &
113 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
114#if XFS_BIG_BLKNOS
115 s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
116 (((xfs_fsblock_t)l1) >> 21);
117#else
118#ifdef DEBUG
119 {
120 xfs_dfsbno_t b;
121
122 b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
123 (((xfs_dfsbno_t)l1) >> 21);
124 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
125 s->br_startblock = (xfs_fsblock_t)b;
126 }
127#else /* !DEBUG */
128 s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
129#endif /* DEBUG */
130#endif /* XFS_BIG_BLKNOS */
131 s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
132 /* This is xfs_extent_state() in-line */
133 if (ext_flag) {
134 ASSERT(s->br_blockcount != 0); /* saved for DMIG */
135 st = XFS_EXT_UNWRITTEN;
136 } else
137 st = XFS_EXT_NORM;
138 s->br_state = st;
139}
140
141void
142xfs_bmbt_get_all(
143 xfs_bmbt_rec_host_t *r,
144 xfs_bmbt_irec_t *s)
145{
146 __xfs_bmbt_get_all(r->l0, r->l1, s);
147}
148
149/*
150 * Extract the blockcount field from an in memory bmap extent record.
151 */
152xfs_filblks_t
153xfs_bmbt_get_blockcount(
154 xfs_bmbt_rec_host_t *r)
155{
156 return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
157}
158
159/*
160 * Extract the startblock field from an in memory bmap extent record.
161 */
162xfs_fsblock_t
163xfs_bmbt_get_startblock(
164 xfs_bmbt_rec_host_t *r)
165{
166#if XFS_BIG_BLKNOS
167 return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
168 (((xfs_fsblock_t)r->l1) >> 21);
169#else
170#ifdef DEBUG
171 xfs_dfsbno_t b;
172
173 b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
174 (((xfs_dfsbno_t)r->l1) >> 21);
175 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
176 return (xfs_fsblock_t)b;
177#else /* !DEBUG */
178 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
179#endif /* DEBUG */
180#endif /* XFS_BIG_BLKNOS */
181}
182
183/*
184 * Extract the startoff field from an in memory bmap extent record.
185 */
186xfs_fileoff_t
187xfs_bmbt_get_startoff(
188 xfs_bmbt_rec_host_t *r)
189{
190 return ((xfs_fileoff_t)r->l0 &
191 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
192}
193
194xfs_exntst_t
195xfs_bmbt_get_state(
196 xfs_bmbt_rec_host_t *r)
197{
198 int ext_flag;
199
200 ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
201 return xfs_extent_state(xfs_bmbt_get_blockcount(r),
202 ext_flag);
203}
204
205/*
206 * Extract the blockcount field from an on disk bmap extent record.
207 */
208xfs_filblks_t
209xfs_bmbt_disk_get_blockcount(
210 xfs_bmbt_rec_t *r)
211{
212 return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
213}
214
215/*
216 * Extract the startoff field from a disk format bmap extent record.
217 */
218xfs_fileoff_t
219xfs_bmbt_disk_get_startoff(
220 xfs_bmbt_rec_t *r)
221{
222 return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
223 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
224}
225
226
227/*
228 * Set all the fields in a bmap extent record from the arguments.
229 */
230void
231xfs_bmbt_set_allf(
232 xfs_bmbt_rec_host_t *r,
233 xfs_fileoff_t startoff,
234 xfs_fsblock_t startblock,
235 xfs_filblks_t blockcount,
236 xfs_exntst_t state)
237{
238 int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
239
240 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
241 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
242 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
243
244#if XFS_BIG_BLKNOS
245 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
246
247 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
248 ((xfs_bmbt_rec_base_t)startoff << 9) |
249 ((xfs_bmbt_rec_base_t)startblock >> 43);
250 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
251 ((xfs_bmbt_rec_base_t)blockcount &
252 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
253#else /* !XFS_BIG_BLKNOS */
254 if (isnullstartblock(startblock)) {
255 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
256 ((xfs_bmbt_rec_base_t)startoff << 9) |
257 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
258 r->l1 = xfs_mask64hi(11) |
259 ((xfs_bmbt_rec_base_t)startblock << 21) |
260 ((xfs_bmbt_rec_base_t)blockcount &
261 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
262 } else {
263 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
264 ((xfs_bmbt_rec_base_t)startoff << 9);
265 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
266 ((xfs_bmbt_rec_base_t)blockcount &
267 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
268 }
269#endif /* XFS_BIG_BLKNOS */
270}
271
272/*
273 * Set all the fields in a bmap extent record from the uncompressed form.
274 */
275void
276xfs_bmbt_set_all(
277 xfs_bmbt_rec_host_t *r,
278 xfs_bmbt_irec_t *s)
279{
280 xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
281 s->br_blockcount, s->br_state);
282}
283
284
285/*
286 * Set all the fields in a disk format bmap extent record from the arguments.
287 */
288void
289xfs_bmbt_disk_set_allf(
290 xfs_bmbt_rec_t *r,
291 xfs_fileoff_t startoff,
292 xfs_fsblock_t startblock,
293 xfs_filblks_t blockcount,
294 xfs_exntst_t state)
295{
296 int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
297
298 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
299 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
300 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
301
302#if XFS_BIG_BLKNOS
303 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
304
305 r->l0 = cpu_to_be64(
306 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
307 ((xfs_bmbt_rec_base_t)startoff << 9) |
308 ((xfs_bmbt_rec_base_t)startblock >> 43));
309 r->l1 = cpu_to_be64(
310 ((xfs_bmbt_rec_base_t)startblock << 21) |
311 ((xfs_bmbt_rec_base_t)blockcount &
312 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
313#else /* !XFS_BIG_BLKNOS */
314 if (isnullstartblock(startblock)) {
315 r->l0 = cpu_to_be64(
316 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
317 ((xfs_bmbt_rec_base_t)startoff << 9) |
318 (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
319 r->l1 = cpu_to_be64(xfs_mask64hi(11) |
320 ((xfs_bmbt_rec_base_t)startblock << 21) |
321 ((xfs_bmbt_rec_base_t)blockcount &
322 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
323 } else {
324 r->l0 = cpu_to_be64(
325 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
326 ((xfs_bmbt_rec_base_t)startoff << 9));
327 r->l1 = cpu_to_be64(
328 ((xfs_bmbt_rec_base_t)startblock << 21) |
329 ((xfs_bmbt_rec_base_t)blockcount &
330 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
331 }
332#endif /* XFS_BIG_BLKNOS */
333}
334
335/*
336 * Set all the fields in a bmap extent record from the uncompressed form.
337 */
338STATIC void
339xfs_bmbt_disk_set_all(
340 xfs_bmbt_rec_t *r,
341 xfs_bmbt_irec_t *s)
342{
343 xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
344 s->br_blockcount, s->br_state);
345}
346
347/*
348 * Set the blockcount field in a bmap extent record.
349 */
350void
351xfs_bmbt_set_blockcount(
352 xfs_bmbt_rec_host_t *r,
353 xfs_filblks_t v)
354{
355 ASSERT((v & xfs_mask64hi(43)) == 0);
356 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
357 (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
358}
359
360/*
361 * Set the startblock field in a bmap extent record.
362 */
363void
364xfs_bmbt_set_startblock(
365 xfs_bmbt_rec_host_t *r,
366 xfs_fsblock_t v)
367{
368#if XFS_BIG_BLKNOS
369 ASSERT((v & xfs_mask64hi(12)) == 0);
370 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
371 (xfs_bmbt_rec_base_t)(v >> 43);
372 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
373 (xfs_bmbt_rec_base_t)(v << 21);
374#else /* !XFS_BIG_BLKNOS */
375 if (isnullstartblock(v)) {
376 r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
377 r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
378 ((xfs_bmbt_rec_base_t)v << 21) |
379 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
380 } else {
381 r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
382 r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
383 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
384 }
385#endif /* XFS_BIG_BLKNOS */
386}
387
388/*
389 * Set the startoff field in a bmap extent record.
390 */
391void
392xfs_bmbt_set_startoff(
393 xfs_bmbt_rec_host_t *r,
394 xfs_fileoff_t v)
395{
396 ASSERT((v & xfs_mask64hi(9)) == 0);
397 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
398 ((xfs_bmbt_rec_base_t)v << 9) |
399 (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
400}
401
402/*
403 * Set the extent state field in a bmap extent record.
404 */
405void
406xfs_bmbt_set_state(
407 xfs_bmbt_rec_host_t *r,
408 xfs_exntst_t v)
409{
410 ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
411 if (v == XFS_EXT_NORM)
412 r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
413 else
414 r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
415}
416
417/*
418 * Convert in-memory form of btree root to on-disk form.
419 */
420void
421xfs_bmbt_to_bmdr(
422 struct xfs_mount *mp,
423 struct xfs_btree_block *rblock,
424 int rblocklen,
425 xfs_bmdr_block_t *dblock,
426 int dblocklen)
427{
428 int dmxr;
429 xfs_bmbt_key_t *fkp;
430 __be64 *fpp;
431 xfs_bmbt_key_t *tkp;
432 __be64 *tpp;
433
434 if (xfs_sb_version_hascrc(&mp->m_sb)) {
435 ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
436 ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
437 ASSERT(rblock->bb_u.l.bb_blkno ==
438 cpu_to_be64(XFS_BUF_DADDR_NULL));
439 } else
440 ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
441 ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
442 ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
443 ASSERT(rblock->bb_level != 0);
444 dblock->bb_level = rblock->bb_level;
445 dblock->bb_numrecs = rblock->bb_numrecs;
446 dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
447 fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
448 tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
449 fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
450 tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
451 dmxr = be16_to_cpu(dblock->bb_numrecs);
452 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
453 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
454}
455
456/*
457 * Check extent records, which have just been read, for
458 * any bit in the extent flag field. ASSERT on debug
459 * kernels, as this condition should not occur.
460 * Return an error condition (1) if any flags found,
461 * otherwise return 0.
462 */
463
464int
465xfs_check_nostate_extents(
466 xfs_ifork_t *ifp,
467 xfs_extnum_t idx,
468 xfs_extnum_t num)
469{
470 for (; num > 0; num--, idx++) {
471 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
472 if ((ep->l0 >>
473 (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
474 ASSERT(0);
475 return 1;
476 }
477 }
478 return 0;
479}
480
481
482STATIC struct xfs_btree_cur *
483xfs_bmbt_dup_cursor(
484 struct xfs_btree_cur *cur)
485{
486 struct xfs_btree_cur *new;
487
488 new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
489 cur->bc_private.b.ip, cur->bc_private.b.whichfork);
490
491 /*
492 * Copy the firstblock, flist, and flags values,
493 * since init cursor doesn't get them.
494 */
495 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
496 new->bc_private.b.flist = cur->bc_private.b.flist;
497 new->bc_private.b.flags = cur->bc_private.b.flags;
498
499 return new;
500}
501
502STATIC void
503xfs_bmbt_update_cursor(
504 struct xfs_btree_cur *src,
505 struct xfs_btree_cur *dst)
506{
507 ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
508 (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
509 ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
510
511 dst->bc_private.b.allocated += src->bc_private.b.allocated;
512 dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
513
514 src->bc_private.b.allocated = 0;
515}
516
517STATIC int
518xfs_bmbt_alloc_block(
519 struct xfs_btree_cur *cur,
520 union xfs_btree_ptr *start,
521 union xfs_btree_ptr *new,
522 int *stat)
523{
524 xfs_alloc_arg_t args; /* block allocation args */
525 int error; /* error return value */
526
527 memset(&args, 0, sizeof(args));
528 args.tp = cur->bc_tp;
529 args.mp = cur->bc_mp;
530 args.fsbno = cur->bc_private.b.firstblock;
531 args.firstblock = args.fsbno;
532
533 if (args.fsbno == NULLFSBLOCK) {
534 args.fsbno = be64_to_cpu(start->l);
535 args.type = XFS_ALLOCTYPE_START_BNO;
536 /*
537 * Make sure there is sufficient room left in the AG to
538 * complete a full tree split for an extent insert. If
539 * we are converting the middle part of an extent then
540 * we may need space for two tree splits.
541 *
542 * We are relying on the caller to make the correct block
543 * reservation for this operation to succeed. If the
544 * reservation amount is insufficient then we may fail a
545 * block allocation here and corrupt the filesystem.
546 */
547 args.minleft = xfs_trans_get_block_res(args.tp);
548 } else if (cur->bc_private.b.flist->xbf_low) {
549 args.type = XFS_ALLOCTYPE_START_BNO;
550 } else {
551 args.type = XFS_ALLOCTYPE_NEAR_BNO;
552 }
553
554 args.minlen = args.maxlen = args.prod = 1;
555 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
556 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
557 error = -ENOSPC;
558 goto error0;
559 }
560 error = xfs_alloc_vextent(&args);
561 if (error)
562 goto error0;
563
564 if (args.fsbno == NULLFSBLOCK && args.minleft) {
565 /*
566 * Could not find an AG with enough free space to satisfy
567 * a full btree split. Try again without minleft and if
568 * successful activate the lowspace algorithm.
569 */
570 args.fsbno = 0;
571 args.type = XFS_ALLOCTYPE_FIRST_AG;
572 args.minleft = 0;
573 error = xfs_alloc_vextent(&args);
574 if (error)
575 goto error0;
576 cur->bc_private.b.flist->xbf_low = 1;
577 }
578 if (args.fsbno == NULLFSBLOCK) {
579 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
580 *stat = 0;
581 return 0;
582 }
583 ASSERT(args.len == 1);
584 cur->bc_private.b.firstblock = args.fsbno;
585 cur->bc_private.b.allocated++;
586 cur->bc_private.b.ip->i_d.di_nblocks++;
587 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
588 xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
589 XFS_TRANS_DQ_BCOUNT, 1L);
590
591 new->l = cpu_to_be64(args.fsbno);
592
593 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
594 *stat = 1;
595 return 0;
596
597 error0:
598 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
599 return error;
600}
601
602STATIC int
603xfs_bmbt_free_block(
604 struct xfs_btree_cur *cur,
605 struct xfs_buf *bp)
606{
607 struct xfs_mount *mp = cur->bc_mp;
608 struct xfs_inode *ip = cur->bc_private.b.ip;
609 struct xfs_trans *tp = cur->bc_tp;
610 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
611
612 xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
613 ip->i_d.di_nblocks--;
614
615 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
616 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
617 xfs_trans_binval(tp, bp);
618 return 0;
619}
620
621STATIC int
622xfs_bmbt_get_minrecs(
623 struct xfs_btree_cur *cur,
624 int level)
625{
626 if (level == cur->bc_nlevels - 1) {
627 struct xfs_ifork *ifp;
628
629 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
630 cur->bc_private.b.whichfork);
631
632 return xfs_bmbt_maxrecs(cur->bc_mp,
633 ifp->if_broot_bytes, level == 0) / 2;
634 }
635
636 return cur->bc_mp->m_bmap_dmnr[level != 0];
637}
638
639int
640xfs_bmbt_get_maxrecs(
641 struct xfs_btree_cur *cur,
642 int level)
643{
644 if (level == cur->bc_nlevels - 1) {
645 struct xfs_ifork *ifp;
646
647 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
648 cur->bc_private.b.whichfork);
649
650 return xfs_bmbt_maxrecs(cur->bc_mp,
651 ifp->if_broot_bytes, level == 0);
652 }
653
654 return cur->bc_mp->m_bmap_dmxr[level != 0];
655
656}
657
658/*
659 * Get the maximum records we could store in the on-disk format.
660 *
661 * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
662 * for the root node this checks the available space in the dinode fork
663 * so that we can resize the in-memory buffer to match it. After a
664 * resize to the maximum size this function returns the same value
665 * as xfs_bmbt_get_maxrecs for the root node, too.
666 */
667STATIC int
668xfs_bmbt_get_dmaxrecs(
669 struct xfs_btree_cur *cur,
670 int level)
671{
672 if (level != cur->bc_nlevels - 1)
673 return cur->bc_mp->m_bmap_dmxr[level != 0];
674 return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
675}
676
677STATIC void
678xfs_bmbt_init_key_from_rec(
679 union xfs_btree_key *key,
680 union xfs_btree_rec *rec)
681{
682 key->bmbt.br_startoff =
683 cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
684}
685
686STATIC void
687xfs_bmbt_init_rec_from_key(
688 union xfs_btree_key *key,
689 union xfs_btree_rec *rec)
690{
691 ASSERT(key->bmbt.br_startoff != 0);
692
693 xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
694 0, 0, XFS_EXT_NORM);
695}
696
697STATIC void
698xfs_bmbt_init_rec_from_cur(
699 struct xfs_btree_cur *cur,
700 union xfs_btree_rec *rec)
701{
702 xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
703}
704
705STATIC void
706xfs_bmbt_init_ptr_from_cur(
707 struct xfs_btree_cur *cur,
708 union xfs_btree_ptr *ptr)
709{
710 ptr->l = 0;
711}
712
713STATIC __int64_t
714xfs_bmbt_key_diff(
715 struct xfs_btree_cur *cur,
716 union xfs_btree_key *key)
717{
718 return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
719 cur->bc_rec.b.br_startoff;
720}
721
722static bool
723xfs_bmbt_verify(
724 struct xfs_buf *bp)
725{
726 struct xfs_mount *mp = bp->b_target->bt_mount;
727 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
728 unsigned int level;
729
730 switch (block->bb_magic) {
731 case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
732 if (!xfs_sb_version_hascrc(&mp->m_sb))
733 return false;
734 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
735 return false;
736 if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
737 return false;
738 /*
739 * XXX: need a better way of verifying the owner here. Right now
740 * just make sure there has been one set.
741 */
742 if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
743 return false;
744 /* fall through */
745 case cpu_to_be32(XFS_BMAP_MAGIC):
746 break;
747 default:
748 return false;
749 }
750
751 /*
752 * numrecs and level verification.
753 *
754 * We don't know what fork we belong to, so just verify that the level
755 * is less than the maximum of the two. Later checks will be more
756 * precise.
757 */
758 level = be16_to_cpu(block->bb_level);
759 if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
760 return false;
761 if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
762 return false;
763
764 /* sibling pointer verification */
765 if (!block->bb_u.l.bb_leftsib ||
766 (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
767 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
768 return false;
769 if (!block->bb_u.l.bb_rightsib ||
770 (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
771 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
772 return false;
773
774 return true;
775}
776
777static void
778xfs_bmbt_read_verify(
779 struct xfs_buf *bp)
780{
781 if (!xfs_btree_lblock_verify_crc(bp))
782 xfs_buf_ioerror(bp, -EFSBADCRC);
783 else if (!xfs_bmbt_verify(bp))
784 xfs_buf_ioerror(bp, -EFSCORRUPTED);
785
786 if (bp->b_error) {
787 trace_xfs_btree_corrupt(bp, _RET_IP_);
788 xfs_verifier_error(bp);
789 }
790}
791
792static void
793xfs_bmbt_write_verify(
794 struct xfs_buf *bp)
795{
796 if (!xfs_bmbt_verify(bp)) {
797 trace_xfs_btree_corrupt(bp, _RET_IP_);
798 xfs_buf_ioerror(bp, -EFSCORRUPTED);
799 xfs_verifier_error(bp);
800 return;
801 }
802 xfs_btree_lblock_calc_crc(bp);
803}
804
805const struct xfs_buf_ops xfs_bmbt_buf_ops = {
806 .verify_read = xfs_bmbt_read_verify,
807 .verify_write = xfs_bmbt_write_verify,
808};
809
810
#if defined(DEBUG) || defined(XFS_WARN)
/*
 * Ordering check for two keys: non-zero when the start offset of k1 is
 * strictly below that of k2.
 */
STATIC int
xfs_bmbt_keys_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*k1,
	union xfs_btree_key	*k2)
{
	xfs_fileoff_t		first = be64_to_cpu(k1->bmbt.br_startoff);
	xfs_fileoff_t		second = be64_to_cpu(k2->bmbt.br_startoff);

	return first < second;
}

/*
 * Ordering check for two records: non-zero when r1 ends (start offset
 * plus block count) at or before the start offset of r2.
 */
STATIC int
xfs_bmbt_recs_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*r1,
	union xfs_btree_rec	*r2)
{
	return xfs_bmbt_disk_get_startoff(&r2->bmbt) >=
		xfs_bmbt_disk_get_startoff(&r1->bmbt) +
		xfs_bmbt_disk_get_blockcount(&r1->bmbt);
}
#endif	/* DEBUG || XFS_WARN */
833
834static const struct xfs_btree_ops xfs_bmbt_ops = {
835 .rec_len = sizeof(xfs_bmbt_rec_t),
836 .key_len = sizeof(xfs_bmbt_key_t),
837
838 .dup_cursor = xfs_bmbt_dup_cursor,
839 .update_cursor = xfs_bmbt_update_cursor,
840 .alloc_block = xfs_bmbt_alloc_block,
841 .free_block = xfs_bmbt_free_block,
842 .get_maxrecs = xfs_bmbt_get_maxrecs,
843 .get_minrecs = xfs_bmbt_get_minrecs,
844 .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
845 .init_key_from_rec = xfs_bmbt_init_key_from_rec,
846 .init_rec_from_key = xfs_bmbt_init_rec_from_key,
847 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
848 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
849 .key_diff = xfs_bmbt_key_diff,
850 .buf_ops = &xfs_bmbt_buf_ops,
851#if defined(DEBUG) || defined(XFS_WARN)
852 .keys_inorder = xfs_bmbt_keys_inorder,
853 .recs_inorder = xfs_bmbt_recs_inorder,
854#endif
855};
856
857/*
858 * Allocate a new bmap btree cursor.
859 */
860struct xfs_btree_cur * /* new bmap btree cursor */
861xfs_bmbt_init_cursor(
862 struct xfs_mount *mp, /* file system mount point */
863 struct xfs_trans *tp, /* transaction pointer */
864 struct xfs_inode *ip, /* inode owning the btree */
865 int whichfork) /* data or attr fork */
866{
867 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
868 struct xfs_btree_cur *cur;
869
870 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
871
872 cur->bc_tp = tp;
873 cur->bc_mp = mp;
874 cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
875 cur->bc_btnum = XFS_BTNUM_BMAP;
876 cur->bc_blocklog = mp->m_sb.sb_blocklog;
877
878 cur->bc_ops = &xfs_bmbt_ops;
879 cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
880 if (xfs_sb_version_hascrc(&mp->m_sb))
881 cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
882
883 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
884 cur->bc_private.b.ip = ip;
885 cur->bc_private.b.firstblock = NULLFSBLOCK;
886 cur->bc_private.b.flist = NULL;
887 cur->bc_private.b.allocated = 0;
888 cur->bc_private.b.flags = 0;
889 cur->bc_private.b.whichfork = whichfork;
890
891 return cur;
892}
893
894/*
895 * Calculate number of records in a bmap btree block.
896 */
897int
898xfs_bmbt_maxrecs(
899 struct xfs_mount *mp,
900 int blocklen,
901 int leaf)
902{
903 blocklen -= XFS_BMBT_BLOCK_LEN(mp);
904
905 if (leaf)
906 return blocklen / sizeof(xfs_bmbt_rec_t);
907 return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
908}
909
910/*
911 * Calculate number of records in a bmap btree inode root.
912 */
913int
914xfs_bmdr_maxrecs(
915 int blocklen,
916 int leaf)
917{
918 blocklen -= sizeof(xfs_bmdr_block_t);
919
920 if (leaf)
921 return blocklen / sizeof(xfs_bmdr_rec_t);
922 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
923}
924
925/*
926 * Change the owner of a btree format fork fo the inode passed in. Change it to
927 * the owner of that is passed in so that we can change owners before or after
928 * we switch forks between inodes. The operation that the caller is doing will
929 * determine whether is needs to change owner before or after the switch.
930 *
931 * For demand paged transactional modification, the fork switch should be done
932 * after reading in all the blocks, modifying them and pinning them in the
933 * transaction. For modification when the buffers are already pinned in memory,
934 * the fork switch can be done before changing the owner as we won't need to
935 * validate the owner until the btree buffers are unpinned and writes can occur
936 * again.
937 *
938 * For recovery based ownership change, there is no transactional context and
939 * so a buffer list must be supplied so that we can record the buffers that we
940 * modified for the caller to issue IO on.
941 */
942int
943xfs_bmbt_change_owner(
944 struct xfs_trans *tp,
945 struct xfs_inode *ip,
946 int whichfork,
947 xfs_ino_t new_owner,
948 struct list_head *buffer_list)
949{
950 struct xfs_btree_cur *cur;
951 int error;
952
953 ASSERT(tp || buffer_list);
954 ASSERT(!(tp && buffer_list));
955 if (whichfork == XFS_DATA_FORK)
956 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
957 else
958 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
959
960 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
961 if (!cur)
962 return -ENOMEM;
963
964 error = xfs_btree_change_owner(cur, new_owner, buffer_list);
965 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
966 return error;
967}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
new file mode 100644
index 000000000000..819a8a4dee95
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -0,0 +1,143 @@
1/*
2 * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BMAP_BTREE_H__
19#define __XFS_BMAP_BTREE_H__
20
21struct xfs_btree_cur;
22struct xfs_btree_block;
23struct xfs_mount;
24struct xfs_inode;
25struct xfs_trans;
26
/*
 * Extent state and extent format macros.
 */
#define XFS_EXTFMT_INODE(x) \
	(xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
		XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
#define ISUNWRITTEN(x)	((x)->br_state == XFS_EXT_UNWRITTEN)

/*
 * Btree block header size depends on a superblock flag: CRC-enabled
 * filesystems use the larger long-form header.
 */
#define XFS_BMBT_BLOCK_LEN(mp) \
	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
		XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)

/*
 * Address of the index'th record, key or pointer in a bmap btree block.
 * Indices are one-based. Pointers sit after space for maxrecs keys.
 */
#define XFS_BMBT_REC_ADDR(mp, block, index) \
	((xfs_bmbt_rec_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))

#define XFS_BMBT_KEY_ADDR(mp, block, index) \
	((xfs_bmbt_key_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 ((index) - 1) * sizeof(xfs_bmbt_key_t)))

#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
	((xfs_bmbt_ptr_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
		 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))

/*
 * The same three accessors for a struct xfs_bmdr_block, whose header is
 * sizeof(struct xfs_bmdr_block) bytes.
 */
#define XFS_BMDR_REC_ADDR(block, index) \
	((xfs_bmdr_rec_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 ((index) - 1) * sizeof(xfs_bmdr_rec_t)))

#define XFS_BMDR_KEY_ADDR(block, index) \
	((xfs_bmdr_key_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 ((index) - 1) * sizeof(xfs_bmdr_key_t)))

#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
	((xfs_bmdr_ptr_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
		 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))

/*
 * These are to be used when we know the size of the block and
 * we don't have a cursor.
 */
#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))

/* Bytes taken by a bmap btree header plus nrecs key/pointer pairs. */
#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
	(int)(XFS_BMBT_BLOCK_LEN(mp) + \
	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))

#define XFS_BMAP_BROOT_SPACE(mp, bb) \
	(XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))

/* Bytes taken by an xfs_bmdr_block_t header plus nrecs key/pointer pairs. */
#define XFS_BMDR_SPACE_CALC(nrecs) \
	(int)(sizeof(xfs_bmdr_block_t) + \
	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
#define XFS_BMAP_BMDR_SPACE(bb) \
	(XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))

/*
 * Maximum number of bmap btree levels.
 */
#define XFS_BM_MAXLEVELS(mp,w)		((mp)->m_bm_maxlevels[(w)])
103
104/*
105 * Prototypes for xfs_bmap.c to call.
106 */
107extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
108 struct xfs_btree_block *, int);
109extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
110extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
111extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
112extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
113extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
114
115extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
116extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
117
118extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
119extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
120 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
121extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
122extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
123extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
124extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
125
126extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
127 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
128
129extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
130 xfs_bmdr_block_t *, int);
131
132extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
133extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
134extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
135
136extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
137 int whichfork, xfs_ino_t new_owner,
138 struct list_head *buffer_list);
139
140extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
141 struct xfs_trans *, struct xfs_inode *, int);
142
143#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
new file mode 100644
index 000000000000..ba35c9ccb8f9
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -0,0 +1,4069 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_inode.h"
29#include "xfs_trans.h"
30#include "xfs_inode_item.h"
31#include "xfs_buf_item.h"
32#include "xfs_btree.h"
33#include "xfs_error.h"
34#include "xfs_trace.h"
35#include "xfs_cksum.h"
36#include "xfs_alloc.h"
37
38/*
39 * Cursor allocation zone.
40 */
41kmem_zone_t *xfs_btree_cur_zone;
42
43/*
44 * Btree magic numbers.
45 */
46static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
47 { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
48 XFS_FIBT_MAGIC },
49 { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
50 XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
51};
52#define xfs_btree_magic(cur) \
53 xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
54
55
56STATIC int /* error (0 or EFSCORRUPTED) */
57xfs_btree_check_lblock(
58 struct xfs_btree_cur *cur, /* btree cursor */
59 struct xfs_btree_block *block, /* btree long form block pointer */
60 int level, /* level of the btree block */
61 struct xfs_buf *bp) /* buffer for block, if any */
62{
63 int lblock_ok = 1; /* block passes checks */
64 struct xfs_mount *mp; /* file system mount point */
65
66 mp = cur->bc_mp;
67
68 if (xfs_sb_version_hascrc(&mp->m_sb)) {
69 lblock_ok = lblock_ok &&
70 uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
71 block->bb_u.l.bb_blkno == cpu_to_be64(
72 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
73 }
74
75 lblock_ok = lblock_ok &&
76 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
77 be16_to_cpu(block->bb_level) == level &&
78 be16_to_cpu(block->bb_numrecs) <=
79 cur->bc_ops->get_maxrecs(cur, level) &&
80 block->bb_u.l.bb_leftsib &&
81 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
82 XFS_FSB_SANITY_CHECK(mp,
83 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
84 block->bb_u.l.bb_rightsib &&
85 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
86 XFS_FSB_SANITY_CHECK(mp,
87 be64_to_cpu(block->bb_u.l.bb_rightsib)));
88
89 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
90 XFS_ERRTAG_BTREE_CHECK_LBLOCK,
91 XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
92 if (bp)
93 trace_xfs_btree_corrupt(bp, _RET_IP_);
94 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
95 return -EFSCORRUPTED;
96 }
97 return 0;
98}
99
100STATIC int /* error (0 or EFSCORRUPTED) */
101xfs_btree_check_sblock(
102 struct xfs_btree_cur *cur, /* btree cursor */
103 struct xfs_btree_block *block, /* btree short form block pointer */
104 int level, /* level of the btree block */
105 struct xfs_buf *bp) /* buffer containing block */
106{
107 struct xfs_mount *mp; /* file system mount point */
108 struct xfs_buf *agbp; /* buffer for ag. freespace struct */
109 struct xfs_agf *agf; /* ag. freespace structure */
110 xfs_agblock_t agflen; /* native ag. freespace length */
111 int sblock_ok = 1; /* block passes checks */
112
113 mp = cur->bc_mp;
114 agbp = cur->bc_private.a.agbp;
115 agf = XFS_BUF_TO_AGF(agbp);
116 agflen = be32_to_cpu(agf->agf_length);
117
118 if (xfs_sb_version_hascrc(&mp->m_sb)) {
119 sblock_ok = sblock_ok &&
120 uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
121 block->bb_u.s.bb_blkno == cpu_to_be64(
122 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
123 }
124
125 sblock_ok = sblock_ok &&
126 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
127 be16_to_cpu(block->bb_level) == level &&
128 be16_to_cpu(block->bb_numrecs) <=
129 cur->bc_ops->get_maxrecs(cur, level) &&
130 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
131 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
132 block->bb_u.s.bb_leftsib &&
133 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
134 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
135 block->bb_u.s.bb_rightsib;
136
137 if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
138 XFS_ERRTAG_BTREE_CHECK_SBLOCK,
139 XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
140 if (bp)
141 trace_xfs_btree_corrupt(bp, _RET_IP_);
142 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
143 return -EFSCORRUPTED;
144 }
145 return 0;
146}
147
148/*
149 * Debug routine: check that block header is ok.
150 */
151int
152xfs_btree_check_block(
153 struct xfs_btree_cur *cur, /* btree cursor */
154 struct xfs_btree_block *block, /* generic btree block pointer */
155 int level, /* level of the btree block */
156 struct xfs_buf *bp) /* buffer containing block, if any */
157{
158 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
159 return xfs_btree_check_lblock(cur, block, level, bp);
160 else
161 return xfs_btree_check_sblock(cur, block, level, bp);
162}
163
164/*
165 * Check that (long) pointer is ok.
166 */
167int /* error (0 or EFSCORRUPTED) */
168xfs_btree_check_lptr(
169 struct xfs_btree_cur *cur, /* btree cursor */
170 xfs_dfsbno_t bno, /* btree block disk address */
171 int level) /* btree block level */
172{
173 XFS_WANT_CORRUPTED_RETURN(
174 level > 0 &&
175 bno != NULLDFSBNO &&
176 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
177 return 0;
178}
179
180#ifdef DEBUG
181/*
182 * Check that (short) pointer is ok.
183 */
184STATIC int /* error (0 or EFSCORRUPTED) */
185xfs_btree_check_sptr(
186 struct xfs_btree_cur *cur, /* btree cursor */
187 xfs_agblock_t bno, /* btree block disk address */
188 int level) /* btree block level */
189{
190 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
191
192 XFS_WANT_CORRUPTED_RETURN(
193 level > 0 &&
194 bno != NULLAGBLOCK &&
195 bno != 0 &&
196 bno < agblocks);
197 return 0;
198}
199
200/*
201 * Check that block ptr is ok.
202 */
203STATIC int /* error (0 or EFSCORRUPTED) */
204xfs_btree_check_ptr(
205 struct xfs_btree_cur *cur, /* btree cursor */
206 union xfs_btree_ptr *ptr, /* btree block disk address */
207 int index, /* offset from ptr to check */
208 int level) /* btree block level */
209{
210 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
211 return xfs_btree_check_lptr(cur,
212 be64_to_cpu((&ptr->l)[index]), level);
213 } else {
214 return xfs_btree_check_sptr(cur,
215 be32_to_cpu((&ptr->s)[index]), level);
216 }
217}
218#endif
219
220/*
221 * Calculate CRC on the whole btree block and stuff it into the
222 * long-form btree header.
223 *
224 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
225 * it into the buffer so recovery knows what the last modifcation was that made
226 * it to disk.
227 */
228void
229xfs_btree_lblock_calc_crc(
230 struct xfs_buf *bp)
231{
232 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
233 struct xfs_buf_log_item *bip = bp->b_fspriv;
234
235 if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
236 return;
237 if (bip)
238 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
239 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
240}
241
242bool
243xfs_btree_lblock_verify_crc(
244 struct xfs_buf *bp)
245{
246 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
247 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
248
249 return true;
250}
251
252/*
253 * Calculate CRC on the whole btree block and stuff it into the
254 * short-form btree header.
255 *
256 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
257 * it into the buffer so recovery knows what the last modifcation was that made
258 * it to disk.
259 */
260void
261xfs_btree_sblock_calc_crc(
262 struct xfs_buf *bp)
263{
264 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
265 struct xfs_buf_log_item *bip = bp->b_fspriv;
266
267 if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
268 return;
269 if (bip)
270 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
271 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
272}
273
274bool
275xfs_btree_sblock_verify_crc(
276 struct xfs_buf *bp)
277{
278 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
279 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
280
281 return true;
282}
283
284/*
285 * Delete the btree cursor.
286 */
287void
288xfs_btree_del_cursor(
289 xfs_btree_cur_t *cur, /* btree cursor */
290 int error) /* del because of error */
291{
292 int i; /* btree level */
293
294 /*
295 * Clear the buffer pointers, and release the buffers.
296 * If we're doing this in the face of an error, we
297 * need to make sure to inspect all of the entries
298 * in the bc_bufs array for buffers to be unlocked.
299 * This is because some of the btree code works from
300 * level n down to 0, and if we get an error along
301 * the way we won't have initialized all the entries
302 * down to 0.
303 */
304 for (i = 0; i < cur->bc_nlevels; i++) {
305 if (cur->bc_bufs[i])
306 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
307 else if (!error)
308 break;
309 }
310 /*
311 * Can't free a bmap cursor without having dealt with the
312 * allocated indirect blocks' accounting.
313 */
314 ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
315 cur->bc_private.b.allocated == 0);
316 /*
317 * Free the cursor.
318 */
319 kmem_zone_free(xfs_btree_cur_zone, cur);
320}
321
322/*
323 * Duplicate the btree cursor.
324 * Allocate a new one, copy the record, re-get the buffers.
325 */
326int /* error */
327xfs_btree_dup_cursor(
328 xfs_btree_cur_t *cur, /* input cursor */
329 xfs_btree_cur_t **ncur) /* output cursor */
330{
331 xfs_buf_t *bp; /* btree block's buffer pointer */
332 int error; /* error return value */
333 int i; /* level number of btree block */
334 xfs_mount_t *mp; /* mount structure for filesystem */
335 xfs_btree_cur_t *new; /* new cursor value */
336 xfs_trans_t *tp; /* transaction pointer, can be NULL */
337
338 tp = cur->bc_tp;
339 mp = cur->bc_mp;
340
341 /*
342 * Allocate a new cursor like the old one.
343 */
344 new = cur->bc_ops->dup_cursor(cur);
345
346 /*
347 * Copy the record currently in the cursor.
348 */
349 new->bc_rec = cur->bc_rec;
350
351 /*
352 * For each level current, re-get the buffer and copy the ptr value.
353 */
354 for (i = 0; i < new->bc_nlevels; i++) {
355 new->bc_ptrs[i] = cur->bc_ptrs[i];
356 new->bc_ra[i] = cur->bc_ra[i];
357 bp = cur->bc_bufs[i];
358 if (bp) {
359 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
360 XFS_BUF_ADDR(bp), mp->m_bsize,
361 0, &bp,
362 cur->bc_ops->buf_ops);
363 if (error) {
364 xfs_btree_del_cursor(new, error);
365 *ncur = NULL;
366 return error;
367 }
368 }
369 new->bc_bufs[i] = bp;
370 }
371 *ncur = new;
372 return 0;
373}
374
375/*
376 * XFS btree block layout and addressing:
377 *
378 * There are two types of blocks in the btree: leaf and non-leaf blocks.
379 *
380 * A leaf block starts with a header, followed by records containing
381 * the values. A non-leaf block also starts with the same header, and
382 * then first contains lookup keys followed by an equal number of pointers
383 * to the btree blocks at the previous level.
384 *
385 * +--------+-------+-------+-------+-------+-------+-------+
386 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
387 * +--------+-------+-------+-------+-------+-------+-------+
388 *
389 * +--------+-------+-------+-------+-------+-------+-------+
390 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
391 * +--------+-------+-------+-------+-------+-------+-------+
392 *
393 * The header is called struct xfs_btree_block for reasons better left unknown
394 * and comes in different versions for short (32bit) and long (64bit) block
395 * pointers. The record and key structures are defined by the btree instances
396 * and opaque to the btree core. The block pointers are simple disk endian
397 * integers, available in a short (32bit) and long (64bit) variant.
398 *
399 * The helpers below calculate the offset of a given record, key or pointer
400 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
401 * record, key or pointer (xfs_btree_*_addr). Note that all addressing
402 * inside the btree block is done using indices starting at one, not zero!
403 */
404
405/*
406 * Return size of the btree block header for this btree instance.
407 */
408static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
409{
410 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
411 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
412 return XFS_BTREE_LBLOCK_CRC_LEN;
413 return XFS_BTREE_LBLOCK_LEN;
414 }
415 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
416 return XFS_BTREE_SBLOCK_CRC_LEN;
417 return XFS_BTREE_SBLOCK_LEN;
418}
419
420/*
421 * Return size of btree block pointers for this btree instance.
422 */
423static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
424{
425 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
426 sizeof(__be64) : sizeof(__be32);
427}
428
429/*
430 * Calculate offset of the n-th record in a btree block.
431 */
432STATIC size_t
433xfs_btree_rec_offset(
434 struct xfs_btree_cur *cur,
435 int n)
436{
437 return xfs_btree_block_len(cur) +
438 (n - 1) * cur->bc_ops->rec_len;
439}
440
441/*
442 * Calculate offset of the n-th key in a btree block.
443 */
444STATIC size_t
445xfs_btree_key_offset(
446 struct xfs_btree_cur *cur,
447 int n)
448{
449 return xfs_btree_block_len(cur) +
450 (n - 1) * cur->bc_ops->key_len;
451}
452
453/*
454 * Calculate offset of the n-th block pointer in a btree block.
455 */
456STATIC size_t
457xfs_btree_ptr_offset(
458 struct xfs_btree_cur *cur,
459 int n,
460 int level)
461{
462 return xfs_btree_block_len(cur) +
463 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
464 (n - 1) * xfs_btree_ptr_len(cur);
465}
466
467/*
468 * Return a pointer to the n-th record in the btree block.
469 */
470STATIC union xfs_btree_rec *
471xfs_btree_rec_addr(
472 struct xfs_btree_cur *cur,
473 int n,
474 struct xfs_btree_block *block)
475{
476 return (union xfs_btree_rec *)
477 ((char *)block + xfs_btree_rec_offset(cur, n));
478}
479
480/*
481 * Return a pointer to the n-th key in the btree block.
482 */
483STATIC union xfs_btree_key *
484xfs_btree_key_addr(
485 struct xfs_btree_cur *cur,
486 int n,
487 struct xfs_btree_block *block)
488{
489 return (union xfs_btree_key *)
490 ((char *)block + xfs_btree_key_offset(cur, n));
491}
492
493/*
494 * Return a pointer to the n-th block pointer in the btree block.
495 */
496STATIC union xfs_btree_ptr *
497xfs_btree_ptr_addr(
498 struct xfs_btree_cur *cur,
499 int n,
500 struct xfs_btree_block *block)
501{
502 int level = xfs_btree_get_level(block);
503
504 ASSERT(block->bb_level != 0);
505
506 return (union xfs_btree_ptr *)
507 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
508}
509
510/*
511 * Get the root block which is stored in the inode.
512 *
513 * For now this btree implementation assumes the btree root is always
514 * stored in the if_broot field of an inode fork.
515 */
516STATIC struct xfs_btree_block *
517xfs_btree_get_iroot(
518 struct xfs_btree_cur *cur)
519{
520 struct xfs_ifork *ifp;
521
522 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
523 return (struct xfs_btree_block *)ifp->if_broot;
524}
525
526/*
527 * Retrieve the block pointer from the cursor at the given level.
528 * This may be an inode btree root or from a buffer.
529 */
530STATIC struct xfs_btree_block * /* generic btree block pointer */
531xfs_btree_get_block(
532 struct xfs_btree_cur *cur, /* btree cursor */
533 int level, /* level in btree */
534 struct xfs_buf **bpp) /* buffer containing the block */
535{
536 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
537 (level == cur->bc_nlevels - 1)) {
538 *bpp = NULL;
539 return xfs_btree_get_iroot(cur);
540 }
541
542 *bpp = cur->bc_bufs[level];
543 return XFS_BUF_TO_BLOCK(*bpp);
544}
545
546/*
547 * Get a buffer for the block, return it with no data read.
548 * Long-form addressing.
549 */
550xfs_buf_t * /* buffer for fsbno */
551xfs_btree_get_bufl(
552 xfs_mount_t *mp, /* file system mount point */
553 xfs_trans_t *tp, /* transaction pointer */
554 xfs_fsblock_t fsbno, /* file system block number */
555 uint lock) /* lock flags for get_buf */
556{
557 xfs_daddr_t d; /* real disk block address */
558
559 ASSERT(fsbno != NULLFSBLOCK);
560 d = XFS_FSB_TO_DADDR(mp, fsbno);
561 return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
562}
563
564/*
565 * Get a buffer for the block, return it with no data read.
566 * Short-form addressing.
567 */
568xfs_buf_t * /* buffer for agno/agbno */
569xfs_btree_get_bufs(
570 xfs_mount_t *mp, /* file system mount point */
571 xfs_trans_t *tp, /* transaction pointer */
572 xfs_agnumber_t agno, /* allocation group number */
573 xfs_agblock_t agbno, /* allocation group block number */
574 uint lock) /* lock flags for get_buf */
575{
576 xfs_daddr_t d; /* real disk block address */
577
578 ASSERT(agno != NULLAGNUMBER);
579 ASSERT(agbno != NULLAGBLOCK);
580 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
581 return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
582}
583
584/*
585 * Check for the cursor referring to the last block at the given level.
586 */
587int /* 1=is last block, 0=not last block */
588xfs_btree_islastblock(
589 xfs_btree_cur_t *cur, /* btree cursor */
590 int level) /* level to check */
591{
592 struct xfs_btree_block *block; /* generic btree block pointer */
593 xfs_buf_t *bp; /* buffer containing block */
594
595 block = xfs_btree_get_block(cur, level, &bp);
596 xfs_btree_check_block(cur, block, level, bp);
597 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
598 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
599 else
600 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
601}
602
603/*
604 * Change the cursor to point to the first record at the given level.
605 * Other levels are unaffected.
606 */
607STATIC int /* success=1, failure=0 */
608xfs_btree_firstrec(
609 xfs_btree_cur_t *cur, /* btree cursor */
610 int level) /* level to change */
611{
612 struct xfs_btree_block *block; /* generic btree block pointer */
613 xfs_buf_t *bp; /* buffer containing block */
614
615 /*
616 * Get the block pointer for this level.
617 */
618 block = xfs_btree_get_block(cur, level, &bp);
619 xfs_btree_check_block(cur, block, level, bp);
620 /*
621 * It's empty, there is no such record.
622 */
623 if (!block->bb_numrecs)
624 return 0;
625 /*
626 * Set the ptr value to 1, that's the first record/key.
627 */
628 cur->bc_ptrs[level] = 1;
629 return 1;
630}
631
632/*
633 * Change the cursor to point to the last record in the current block
634 * at the given level. Other levels are unaffected.
635 */
636STATIC int /* success=1, failure=0 */
637xfs_btree_lastrec(
638 xfs_btree_cur_t *cur, /* btree cursor */
639 int level) /* level to change */
640{
641 struct xfs_btree_block *block; /* generic btree block pointer */
642 xfs_buf_t *bp; /* buffer containing block */
643
644 /*
645 * Get the block pointer for this level.
646 */
647 block = xfs_btree_get_block(cur, level, &bp);
648 xfs_btree_check_block(cur, block, level, bp);
649 /*
650 * It's empty, there is no such record.
651 */
652 if (!block->bb_numrecs)
653 return 0;
654 /*
655 * Set the ptr value to numrecs, that's the last record/key.
656 */
657 cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
658 return 1;
659}
660
661/*
662 * Compute first and last byte offsets for the fields given.
663 * Interprets the offsets table, which contains struct field offsets.
664 */
665void
666xfs_btree_offsets(
667 __int64_t fields, /* bitmask of fields */
668 const short *offsets, /* table of field offsets */
669 int nbits, /* number of bits to inspect */
670 int *first, /* output: first byte offset */
671 int *last) /* output: last byte offset */
672{
673 int i; /* current bit number */
674 __int64_t imask; /* mask for current bit number */
675
676 ASSERT(fields != 0);
677 /*
678 * Find the lowest bit, so the first byte offset.
679 */
680 for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
681 if (imask & fields) {
682 *first = offsets[i];
683 break;
684 }
685 }
686 /*
687 * Find the highest bit, so the last byte offset.
688 */
689 for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
690 if (imask & fields) {
691 *last = offsets[i + 1] - 1;
692 break;
693 }
694 }
695}
696
697/*
698 * Get a buffer for the block, return it read in.
699 * Long-form addressing.
700 */
701int
702xfs_btree_read_bufl(
703 struct xfs_mount *mp, /* file system mount point */
704 struct xfs_trans *tp, /* transaction pointer */
705 xfs_fsblock_t fsbno, /* file system block number */
706 uint lock, /* lock flags for read_buf */
707 struct xfs_buf **bpp, /* buffer for fsbno */
708 int refval, /* ref count value for buffer */
709 const struct xfs_buf_ops *ops)
710{
711 struct xfs_buf *bp; /* return value */
712 xfs_daddr_t d; /* real disk block address */
713 int error;
714
715 ASSERT(fsbno != NULLFSBLOCK);
716 d = XFS_FSB_TO_DADDR(mp, fsbno);
717 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
718 mp->m_bsize, lock, &bp, ops);
719 if (error)
720 return error;
721 if (bp)
722 xfs_buf_set_ref(bp, refval);
723 *bpp = bp;
724 return 0;
725}
726
727/*
728 * Read-ahead the block, don't wait for it, don't return a buffer.
729 * Long-form addressing.
730 */
731/* ARGSUSED */
732void
733xfs_btree_reada_bufl(
734 struct xfs_mount *mp, /* file system mount point */
735 xfs_fsblock_t fsbno, /* file system block number */
736 xfs_extlen_t count, /* count of filesystem blocks */
737 const struct xfs_buf_ops *ops)
738{
739 xfs_daddr_t d;
740
741 ASSERT(fsbno != NULLFSBLOCK);
742 d = XFS_FSB_TO_DADDR(mp, fsbno);
743 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
744}
745
746/*
747 * Read-ahead the block, don't wait for it, don't return a buffer.
748 * Short-form addressing.
749 */
750/* ARGSUSED */
751void
752xfs_btree_reada_bufs(
753 struct xfs_mount *mp, /* file system mount point */
754 xfs_agnumber_t agno, /* allocation group number */
755 xfs_agblock_t agbno, /* allocation group block number */
756 xfs_extlen_t count, /* count of filesystem blocks */
757 const struct xfs_buf_ops *ops)
758{
759 xfs_daddr_t d;
760
761 ASSERT(agno != NULLAGNUMBER);
762 ASSERT(agbno != NULLAGBLOCK);
763 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
764 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
765}
766
767STATIC int
768xfs_btree_readahead_lblock(
769 struct xfs_btree_cur *cur,
770 int lr,
771 struct xfs_btree_block *block)
772{
773 int rval = 0;
774 xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
775 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
776
777 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
778 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
779 cur->bc_ops->buf_ops);
780 rval++;
781 }
782
783 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
784 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
785 cur->bc_ops->buf_ops);
786 rval++;
787 }
788
789 return rval;
790}
791
792STATIC int
793xfs_btree_readahead_sblock(
794 struct xfs_btree_cur *cur,
795 int lr,
796 struct xfs_btree_block *block)
797{
798 int rval = 0;
799 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
800 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
801
802
803 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
804 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
805 left, 1, cur->bc_ops->buf_ops);
806 rval++;
807 }
808
809 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
810 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
811 right, 1, cur->bc_ops->buf_ops);
812 rval++;
813 }
814
815 return rval;
816}
817
818/*
819 * Read-ahead btree blocks, at the given level.
820 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
821 */
822STATIC int
823xfs_btree_readahead(
824 struct xfs_btree_cur *cur, /* btree cursor */
825 int lev, /* level in btree */
826 int lr) /* left/right bits */
827{
828 struct xfs_btree_block *block;
829
830 /*
831 * No readahead needed if we are at the root level and the
832 * btree root is stored in the inode.
833 */
834 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
835 (lev == cur->bc_nlevels - 1))
836 return 0;
837
838 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
839 return 0;
840
841 cur->bc_ra[lev] |= lr;
842 block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
843
844 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
845 return xfs_btree_readahead_lblock(cur, lr, block);
846 return xfs_btree_readahead_sblock(cur, lr, block);
847}
848
849STATIC xfs_daddr_t
850xfs_btree_ptr_to_daddr(
851 struct xfs_btree_cur *cur,
852 union xfs_btree_ptr *ptr)
853{
854 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
855 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
856
857 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
858 } else {
859 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
860 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
861
862 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
863 be32_to_cpu(ptr->s));
864 }
865}
866
867/*
868 * Readahead @count btree blocks at the given @ptr location.
869 *
870 * We don't need to care about long or short form btrees here as we have a
871 * method of converting the ptr directly to a daddr available to us.
872 */
873STATIC void
874xfs_btree_readahead_ptr(
875 struct xfs_btree_cur *cur,
876 union xfs_btree_ptr *ptr,
877 xfs_extlen_t count)
878{
879 xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
880 xfs_btree_ptr_to_daddr(cur, ptr),
881 cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
882}
883
884/*
885 * Set the buffer for level "lev" in the cursor to bp, releasing
886 * any previous buffer.
887 */
888STATIC void
889xfs_btree_setbuf(
890 xfs_btree_cur_t *cur, /* btree cursor */
891 int lev, /* level in btree */
892 xfs_buf_t *bp) /* new buffer to set */
893{
894 struct xfs_btree_block *b; /* btree block */
895
896 if (cur->bc_bufs[lev])
897 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
898 cur->bc_bufs[lev] = bp;
899 cur->bc_ra[lev] = 0;
900
901 b = XFS_BUF_TO_BLOCK(bp);
902 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
903 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
904 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
905 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
906 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
907 } else {
908 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
909 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
910 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
911 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
912 }
913}
914
915STATIC int
916xfs_btree_ptr_is_null(
917 struct xfs_btree_cur *cur,
918 union xfs_btree_ptr *ptr)
919{
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
921 return ptr->l == cpu_to_be64(NULLDFSBNO);
922 else
923 return ptr->s == cpu_to_be32(NULLAGBLOCK);
924}
925
926STATIC void
927xfs_btree_set_ptr_null(
928 struct xfs_btree_cur *cur,
929 union xfs_btree_ptr *ptr)
930{
931 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
932 ptr->l = cpu_to_be64(NULLDFSBNO);
933 else
934 ptr->s = cpu_to_be32(NULLAGBLOCK);
935}
936
937/*
938 * Get/set/init sibling pointers
939 */
940STATIC void
941xfs_btree_get_sibling(
942 struct xfs_btree_cur *cur,
943 struct xfs_btree_block *block,
944 union xfs_btree_ptr *ptr,
945 int lr)
946{
947 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
948
949 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
950 if (lr == XFS_BB_RIGHTSIB)
951 ptr->l = block->bb_u.l.bb_rightsib;
952 else
953 ptr->l = block->bb_u.l.bb_leftsib;
954 } else {
955 if (lr == XFS_BB_RIGHTSIB)
956 ptr->s = block->bb_u.s.bb_rightsib;
957 else
958 ptr->s = block->bb_u.s.bb_leftsib;
959 }
960}
961
962STATIC void
963xfs_btree_set_sibling(
964 struct xfs_btree_cur *cur,
965 struct xfs_btree_block *block,
966 union xfs_btree_ptr *ptr,
967 int lr)
968{
969 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
970
971 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
972 if (lr == XFS_BB_RIGHTSIB)
973 block->bb_u.l.bb_rightsib = ptr->l;
974 else
975 block->bb_u.l.bb_leftsib = ptr->l;
976 } else {
977 if (lr == XFS_BB_RIGHTSIB)
978 block->bb_u.s.bb_rightsib = ptr->s;
979 else
980 block->bb_u.s.bb_leftsib = ptr->s;
981 }
982}
983
984void
985xfs_btree_init_block_int(
986 struct xfs_mount *mp,
987 struct xfs_btree_block *buf,
988 xfs_daddr_t blkno,
989 __u32 magic,
990 __u16 level,
991 __u16 numrecs,
992 __u64 owner,
993 unsigned int flags)
994{
995 buf->bb_magic = cpu_to_be32(magic);
996 buf->bb_level = cpu_to_be16(level);
997 buf->bb_numrecs = cpu_to_be16(numrecs);
998
999 if (flags & XFS_BTREE_LONG_PTRS) {
1000 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
1001 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
1002 if (flags & XFS_BTREE_CRC_BLOCKS) {
1003 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1004 buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1005 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
1006 buf->bb_u.l.bb_pad = 0;
1007 buf->bb_u.l.bb_lsn = 0;
1008 }
1009 } else {
1010 /* owner is a 32 bit value on short blocks */
1011 __u32 __owner = (__u32)owner;
1012
1013 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1014 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1015 if (flags & XFS_BTREE_CRC_BLOCKS) {
1016 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1017 buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1018 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
1019 buf->bb_u.s.bb_lsn = 0;
1020 }
1021 }
1022}
1023
1024void
1025xfs_btree_init_block(
1026 struct xfs_mount *mp,
1027 struct xfs_buf *bp,
1028 __u32 magic,
1029 __u16 level,
1030 __u16 numrecs,
1031 __u64 owner,
1032 unsigned int flags)
1033{
1034 xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1035 magic, level, numrecs, owner, flags);
1036}
1037
1038STATIC void
1039xfs_btree_init_block_cur(
1040 struct xfs_btree_cur *cur,
1041 struct xfs_buf *bp,
1042 int level,
1043 int numrecs)
1044{
1045 __u64 owner;
1046
1047 /*
1048 * we can pull the owner from the cursor right now as the different
1049 * owners align directly with the pointer size of the btree. This may
1050 * change in future, but is safe for current users of the generic btree
1051 * code.
1052 */
1053 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1054 owner = cur->bc_private.b.ip->i_ino;
1055 else
1056 owner = cur->bc_private.a.agno;
1057
1058 xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1059 xfs_btree_magic(cur), level, numrecs,
1060 owner, cur->bc_flags);
1061}
1062
1063/*
1064 * Return true if ptr is the last record in the btree and
1065 * we need to track updates to this record. The decision
1066 * will be further refined in the update_lastrec method.
1067 */
1068STATIC int
1069xfs_btree_is_lastrec(
1070 struct xfs_btree_cur *cur,
1071 struct xfs_btree_block *block,
1072 int level)
1073{
1074 union xfs_btree_ptr ptr;
1075
1076 if (level > 0)
1077 return 0;
1078 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1079 return 0;
1080
1081 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1082 if (!xfs_btree_ptr_is_null(cur, &ptr))
1083 return 0;
1084 return 1;
1085}
1086
1087STATIC void
1088xfs_btree_buf_to_ptr(
1089 struct xfs_btree_cur *cur,
1090 struct xfs_buf *bp,
1091 union xfs_btree_ptr *ptr)
1092{
1093 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1094 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1095 XFS_BUF_ADDR(bp)));
1096 else {
1097 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1098 XFS_BUF_ADDR(bp)));
1099 }
1100}
1101
1102STATIC void
1103xfs_btree_set_refs(
1104 struct xfs_btree_cur *cur,
1105 struct xfs_buf *bp)
1106{
1107 switch (cur->bc_btnum) {
1108 case XFS_BTNUM_BNO:
1109 case XFS_BTNUM_CNT:
1110 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1111 break;
1112 case XFS_BTNUM_INO:
1113 case XFS_BTNUM_FINO:
1114 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1115 break;
1116 case XFS_BTNUM_BMAP:
1117 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1118 break;
1119 default:
1120 ASSERT(0);
1121 }
1122}
1123
1124STATIC int
1125xfs_btree_get_buf_block(
1126 struct xfs_btree_cur *cur,
1127 union xfs_btree_ptr *ptr,
1128 int flags,
1129 struct xfs_btree_block **block,
1130 struct xfs_buf **bpp)
1131{
1132 struct xfs_mount *mp = cur->bc_mp;
1133 xfs_daddr_t d;
1134
1135 /* need to sort out how callers deal with failures first */
1136 ASSERT(!(flags & XBF_TRYLOCK));
1137
1138 d = xfs_btree_ptr_to_daddr(cur, ptr);
1139 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1140 mp->m_bsize, flags);
1141
1142 if (!*bpp)
1143 return -ENOMEM;
1144
1145 (*bpp)->b_ops = cur->bc_ops->buf_ops;
1146 *block = XFS_BUF_TO_BLOCK(*bpp);
1147 return 0;
1148}
1149
1150/*
1151 * Read in the buffer at the given ptr and return the buffer and
1152 * the block pointer within the buffer.
1153 */
1154STATIC int
1155xfs_btree_read_buf_block(
1156 struct xfs_btree_cur *cur,
1157 union xfs_btree_ptr *ptr,
1158 int flags,
1159 struct xfs_btree_block **block,
1160 struct xfs_buf **bpp)
1161{
1162 struct xfs_mount *mp = cur->bc_mp;
1163 xfs_daddr_t d;
1164 int error;
1165
1166 /* need to sort out how callers deal with failures first */
1167 ASSERT(!(flags & XBF_TRYLOCK));
1168
1169 d = xfs_btree_ptr_to_daddr(cur, ptr);
1170 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1171 mp->m_bsize, flags, bpp,
1172 cur->bc_ops->buf_ops);
1173 if (error)
1174 return error;
1175
1176 xfs_btree_set_refs(cur, *bpp);
1177 *block = XFS_BUF_TO_BLOCK(*bpp);
1178 return 0;
1179}
1180
1181/*
1182 * Copy keys from one btree block to another.
1183 */
1184STATIC void
1185xfs_btree_copy_keys(
1186 struct xfs_btree_cur *cur,
1187 union xfs_btree_key *dst_key,
1188 union xfs_btree_key *src_key,
1189 int numkeys)
1190{
1191 ASSERT(numkeys >= 0);
1192 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1193}
1194
1195/*
1196 * Copy records from one btree block to another.
1197 */
1198STATIC void
1199xfs_btree_copy_recs(
1200 struct xfs_btree_cur *cur,
1201 union xfs_btree_rec *dst_rec,
1202 union xfs_btree_rec *src_rec,
1203 int numrecs)
1204{
1205 ASSERT(numrecs >= 0);
1206 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1207}
1208
1209/*
1210 * Copy block pointers from one btree block to another.
1211 */
1212STATIC void
1213xfs_btree_copy_ptrs(
1214 struct xfs_btree_cur *cur,
1215 union xfs_btree_ptr *dst_ptr,
1216 union xfs_btree_ptr *src_ptr,
1217 int numptrs)
1218{
1219 ASSERT(numptrs >= 0);
1220 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1221}
1222
1223/*
1224 * Shift keys one index left/right inside a single btree block.
1225 */
1226STATIC void
1227xfs_btree_shift_keys(
1228 struct xfs_btree_cur *cur,
1229 union xfs_btree_key *key,
1230 int dir,
1231 int numkeys)
1232{
1233 char *dst_key;
1234
1235 ASSERT(numkeys >= 0);
1236 ASSERT(dir == 1 || dir == -1);
1237
1238 dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1239 memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1240}
1241
1242/*
1243 * Shift records one index left/right inside a single btree block.
1244 */
1245STATIC void
1246xfs_btree_shift_recs(
1247 struct xfs_btree_cur *cur,
1248 union xfs_btree_rec *rec,
1249 int dir,
1250 int numrecs)
1251{
1252 char *dst_rec;
1253
1254 ASSERT(numrecs >= 0);
1255 ASSERT(dir == 1 || dir == -1);
1256
1257 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1258 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1259}
1260
1261/*
1262 * Shift block pointers one index left/right inside a single btree block.
1263 */
1264STATIC void
1265xfs_btree_shift_ptrs(
1266 struct xfs_btree_cur *cur,
1267 union xfs_btree_ptr *ptr,
1268 int dir,
1269 int numptrs)
1270{
1271 char *dst_ptr;
1272
1273 ASSERT(numptrs >= 0);
1274 ASSERT(dir == 1 || dir == -1);
1275
1276 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1277 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1278}
1279
1280/*
1281 * Log key values from the btree block.
1282 */
1283STATIC void
1284xfs_btree_log_keys(
1285 struct xfs_btree_cur *cur,
1286 struct xfs_buf *bp,
1287 int first,
1288 int last)
1289{
1290 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1291 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1292
1293 if (bp) {
1294 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1295 xfs_trans_log_buf(cur->bc_tp, bp,
1296 xfs_btree_key_offset(cur, first),
1297 xfs_btree_key_offset(cur, last + 1) - 1);
1298 } else {
1299 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1300 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1301 }
1302
1303 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1304}
1305
1306/*
1307 * Log record values from the btree block.
1308 */
1309void
1310xfs_btree_log_recs(
1311 struct xfs_btree_cur *cur,
1312 struct xfs_buf *bp,
1313 int first,
1314 int last)
1315{
1316 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1317 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1318
1319 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1320 xfs_trans_log_buf(cur->bc_tp, bp,
1321 xfs_btree_rec_offset(cur, first),
1322 xfs_btree_rec_offset(cur, last + 1) - 1);
1323
1324 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1325}
1326
1327/*
1328 * Log block pointer fields from a btree block (nonleaf).
1329 */
1330STATIC void
1331xfs_btree_log_ptrs(
1332 struct xfs_btree_cur *cur, /* btree cursor */
1333 struct xfs_buf *bp, /* buffer containing btree block */
1334 int first, /* index of first pointer to log */
1335 int last) /* index of last pointer to log */
1336{
1337 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1338 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1339
1340 if (bp) {
1341 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
1342 int level = xfs_btree_get_level(block);
1343
1344 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1345 xfs_trans_log_buf(cur->bc_tp, bp,
1346 xfs_btree_ptr_offset(cur, first, level),
1347 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1348 } else {
1349 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1350 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1351 }
1352
1353 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1354}
1355
1356/*
1357 * Log fields from a btree block header.
1358 */
1359void
1360xfs_btree_log_block(
1361 struct xfs_btree_cur *cur, /* btree cursor */
1362 struct xfs_buf *bp, /* buffer containing btree block */
1363 int fields) /* mask of fields: XFS_BB_... */
1364{
1365 int first; /* first byte offset logged */
1366 int last; /* last byte offset logged */
1367 static const short soffsets[] = { /* table of offsets (short) */
1368 offsetof(struct xfs_btree_block, bb_magic),
1369 offsetof(struct xfs_btree_block, bb_level),
1370 offsetof(struct xfs_btree_block, bb_numrecs),
1371 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1372 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1373 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1374 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1375 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1376 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1377 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1378 XFS_BTREE_SBLOCK_CRC_LEN
1379 };
1380 static const short loffsets[] = { /* table of offsets (long) */
1381 offsetof(struct xfs_btree_block, bb_magic),
1382 offsetof(struct xfs_btree_block, bb_level),
1383 offsetof(struct xfs_btree_block, bb_numrecs),
1384 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1385 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1386 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1387 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1388 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1389 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1390 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1391 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1392 XFS_BTREE_LBLOCK_CRC_LEN
1393 };
1394
1395 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1396 XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1397
1398 if (bp) {
1399 int nbits;
1400
1401 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1402 /*
1403 * We don't log the CRC when updating a btree
1404 * block but instead recreate it during log
1405 * recovery. As the log buffers have checksums
1406 * of their own this is safe and avoids logging a crc
1407 * update in a lot of places.
1408 */
1409 if (fields == XFS_BB_ALL_BITS)
1410 fields = XFS_BB_ALL_BITS_CRC;
1411 nbits = XFS_BB_NUM_BITS_CRC;
1412 } else {
1413 nbits = XFS_BB_NUM_BITS;
1414 }
1415 xfs_btree_offsets(fields,
1416 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1417 loffsets : soffsets,
1418 nbits, &first, &last);
1419 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1420 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1421 } else {
1422 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1423 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1424 }
1425
1426 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1427}
1428
1429/*
1430 * Increment cursor by one record at the level.
1431 * For nonzero levels the leaf-ward information is untouched.
1432 */
1433int /* error */
1434xfs_btree_increment(
1435 struct xfs_btree_cur *cur,
1436 int level,
1437 int *stat) /* success/failure */
1438{
1439 struct xfs_btree_block *block;
1440 union xfs_btree_ptr ptr;
1441 struct xfs_buf *bp;
1442 int error; /* error return value */
1443 int lev;
1444
1445 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1446 XFS_BTREE_TRACE_ARGI(cur, level);
1447
1448 ASSERT(level < cur->bc_nlevels);
1449
1450 /* Read-ahead to the right at this level. */
1451 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1452
1453 /* Get a pointer to the btree block. */
1454 block = xfs_btree_get_block(cur, level, &bp);
1455
1456#ifdef DEBUG
1457 error = xfs_btree_check_block(cur, block, level, bp);
1458 if (error)
1459 goto error0;
1460#endif
1461
1462 /* We're done if we remain in the block after the increment. */
1463 if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1464 goto out1;
1465
1466 /* Fail if we just went off the right edge of the tree. */
1467 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1468 if (xfs_btree_ptr_is_null(cur, &ptr))
1469 goto out0;
1470
1471 XFS_BTREE_STATS_INC(cur, increment);
1472
1473 /*
1474 * March up the tree incrementing pointers.
1475 * Stop when we don't go off the right edge of a block.
1476 */
1477 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1478 block = xfs_btree_get_block(cur, lev, &bp);
1479
1480#ifdef DEBUG
1481 error = xfs_btree_check_block(cur, block, lev, bp);
1482 if (error)
1483 goto error0;
1484#endif
1485
1486 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1487 break;
1488
1489 /* Read-ahead the right block for the next loop. */
1490 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1491 }
1492
1493 /*
1494 * If we went off the root then we are either seriously
1495 * confused or have the tree root in an inode.
1496 */
1497 if (lev == cur->bc_nlevels) {
1498 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1499 goto out0;
1500 ASSERT(0);
1501 error = -EFSCORRUPTED;
1502 goto error0;
1503 }
1504 ASSERT(lev < cur->bc_nlevels);
1505
1506 /*
1507 * Now walk back down the tree, fixing up the cursor's buffer
1508 * pointers and key numbers.
1509 */
1510 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1511 union xfs_btree_ptr *ptrp;
1512
1513 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1514 --lev;
1515 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1516 if (error)
1517 goto error0;
1518
1519 xfs_btree_setbuf(cur, lev, bp);
1520 cur->bc_ptrs[lev] = 1;
1521 }
1522out1:
1523 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1524 *stat = 1;
1525 return 0;
1526
1527out0:
1528 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1529 *stat = 0;
1530 return 0;
1531
1532error0:
1533 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1534 return error;
1535}
1536
1537/*
1538 * Decrement cursor by one record at the level.
1539 * For nonzero levels the leaf-ward information is untouched.
1540 */
1541int /* error */
1542xfs_btree_decrement(
1543 struct xfs_btree_cur *cur,
1544 int level,
1545 int *stat) /* success/failure */
1546{
1547 struct xfs_btree_block *block;
1548 xfs_buf_t *bp;
1549 int error; /* error return value */
1550 int lev;
1551 union xfs_btree_ptr ptr;
1552
1553 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1554 XFS_BTREE_TRACE_ARGI(cur, level);
1555
1556 ASSERT(level < cur->bc_nlevels);
1557
1558 /* Read-ahead to the left at this level. */
1559 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1560
1561 /* We're done if we remain in the block after the decrement. */
1562 if (--cur->bc_ptrs[level] > 0)
1563 goto out1;
1564
1565 /* Get a pointer to the btree block. */
1566 block = xfs_btree_get_block(cur, level, &bp);
1567
1568#ifdef DEBUG
1569 error = xfs_btree_check_block(cur, block, level, bp);
1570 if (error)
1571 goto error0;
1572#endif
1573
1574 /* Fail if we just went off the left edge of the tree. */
1575 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1576 if (xfs_btree_ptr_is_null(cur, &ptr))
1577 goto out0;
1578
1579 XFS_BTREE_STATS_INC(cur, decrement);
1580
1581 /*
1582 * March up the tree decrementing pointers.
1583 * Stop when we don't go off the left edge of a block.
1584 */
1585 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1586 if (--cur->bc_ptrs[lev] > 0)
1587 break;
1588 /* Read-ahead the left block for the next loop. */
1589 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1590 }
1591
1592 /*
1593 * If we went off the root then we are seriously confused.
1594 * or the root of the tree is in an inode.
1595 */
1596 if (lev == cur->bc_nlevels) {
1597 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1598 goto out0;
1599 ASSERT(0);
1600 error = -EFSCORRUPTED;
1601 goto error0;
1602 }
1603 ASSERT(lev < cur->bc_nlevels);
1604
1605 /*
1606 * Now walk back down the tree, fixing up the cursor's buffer
1607 * pointers and key numbers.
1608 */
1609 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1610 union xfs_btree_ptr *ptrp;
1611
1612 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1613 --lev;
1614 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1615 if (error)
1616 goto error0;
1617 xfs_btree_setbuf(cur, lev, bp);
1618 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1619 }
1620out1:
1621 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1622 *stat = 1;
1623 return 0;
1624
1625out0:
1626 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1627 *stat = 0;
1628 return 0;
1629
1630error0:
1631 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1632 return error;
1633}
1634
1635STATIC int
1636xfs_btree_lookup_get_block(
1637 struct xfs_btree_cur *cur, /* btree cursor */
1638 int level, /* level in the btree */
1639 union xfs_btree_ptr *pp, /* ptr to btree block */
1640 struct xfs_btree_block **blkp) /* return btree block */
1641{
1642 struct xfs_buf *bp; /* buffer pointer for btree block */
1643 int error = 0;
1644
1645 /* special case the root block if in an inode */
1646 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1647 (level == cur->bc_nlevels - 1)) {
1648 *blkp = xfs_btree_get_iroot(cur);
1649 return 0;
1650 }
1651
1652 /*
1653 * If the old buffer at this level for the disk address we are
1654 * looking for re-use it.
1655 *
1656 * Otherwise throw it away and get a new one.
1657 */
1658 bp = cur->bc_bufs[level];
1659 if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1660 *blkp = XFS_BUF_TO_BLOCK(bp);
1661 return 0;
1662 }
1663
1664 error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1665 if (error)
1666 return error;
1667
1668 xfs_btree_setbuf(cur, level, bp);
1669 return 0;
1670}
1671
1672/*
1673 * Get current search key. For level 0 we don't actually have a key
1674 * structure so we make one up from the record. For all other levels
1675 * we just return the right key.
1676 */
1677STATIC union xfs_btree_key *
1678xfs_lookup_get_search_key(
1679 struct xfs_btree_cur *cur,
1680 int level,
1681 int keyno,
1682 struct xfs_btree_block *block,
1683 union xfs_btree_key *kp)
1684{
1685 if (level == 0) {
1686 cur->bc_ops->init_key_from_rec(kp,
1687 xfs_btree_rec_addr(cur, keyno, block));
1688 return kp;
1689 }
1690
1691 return xfs_btree_key_addr(cur, keyno, block);
1692}
1693
1694/*
1695 * Lookup the record. The cursor is made to point to it, based on dir.
1696 * stat is set to 0 if can't find any such record, 1 for success.
1697 */
1698int /* error */
1699xfs_btree_lookup(
1700 struct xfs_btree_cur *cur, /* btree cursor */
1701 xfs_lookup_t dir, /* <=, ==, or >= */
1702 int *stat) /* success/failure */
1703{
1704 struct xfs_btree_block *block; /* current btree block */
1705 __int64_t diff; /* difference for the current key */
1706 int error; /* error return value */
1707 int keyno; /* current key number */
1708 int level; /* level in the btree */
1709 union xfs_btree_ptr *pp; /* ptr to btree block */
1710 union xfs_btree_ptr ptr; /* ptr to btree block */
1711
1712 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1713 XFS_BTREE_TRACE_ARGI(cur, dir);
1714
1715 XFS_BTREE_STATS_INC(cur, lookup);
1716
1717 block = NULL;
1718 keyno = 0;
1719
1720 /* initialise start pointer from cursor */
1721 cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1722 pp = &ptr;
1723
1724 /*
1725 * Iterate over each level in the btree, starting at the root.
1726 * For each level above the leaves, find the key we need, based
1727 * on the lookup record, then follow the corresponding block
1728 * pointer down to the next level.
1729 */
1730 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1731 /* Get the block we need to do the lookup on. */
1732 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1733 if (error)
1734 goto error0;
1735
1736 if (diff == 0) {
1737 /*
1738 * If we already had a key match at a higher level, we
1739 * know we need to use the first entry in this block.
1740 */
1741 keyno = 1;
1742 } else {
1743 /* Otherwise search this block. Do a binary search. */
1744
1745 int high; /* high entry number */
1746 int low; /* low entry number */
1747
1748 /* Set low and high entry numbers, 1-based. */
1749 low = 1;
1750 high = xfs_btree_get_numrecs(block);
1751 if (!high) {
1752 /* Block is empty, must be an empty leaf. */
1753 ASSERT(level == 0 && cur->bc_nlevels == 1);
1754
1755 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1756 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1757 *stat = 0;
1758 return 0;
1759 }
1760
1761 /* Binary search the block. */
1762 while (low <= high) {
1763 union xfs_btree_key key;
1764 union xfs_btree_key *kp;
1765
1766 XFS_BTREE_STATS_INC(cur, compare);
1767
1768 /* keyno is average of low and high. */
1769 keyno = (low + high) >> 1;
1770
1771 /* Get current search key */
1772 kp = xfs_lookup_get_search_key(cur, level,
1773 keyno, block, &key);
1774
1775 /*
1776 * Compute difference to get next direction:
1777 * - less than, move right
1778 * - greater than, move left
1779 * - equal, we're done
1780 */
1781 diff = cur->bc_ops->key_diff(cur, kp);
1782 if (diff < 0)
1783 low = keyno + 1;
1784 else if (diff > 0)
1785 high = keyno - 1;
1786 else
1787 break;
1788 }
1789 }
1790
1791 /*
1792 * If there are more levels, set up for the next level
1793 * by getting the block number and filling in the cursor.
1794 */
1795 if (level > 0) {
1796 /*
1797 * If we moved left, need the previous key number,
1798 * unless there isn't one.
1799 */
1800 if (diff > 0 && --keyno < 1)
1801 keyno = 1;
1802 pp = xfs_btree_ptr_addr(cur, keyno, block);
1803
1804#ifdef DEBUG
1805 error = xfs_btree_check_ptr(cur, pp, 0, level);
1806 if (error)
1807 goto error0;
1808#endif
1809 cur->bc_ptrs[level] = keyno;
1810 }
1811 }
1812
1813 /* Done with the search. See if we need to adjust the results. */
1814 if (dir != XFS_LOOKUP_LE && diff < 0) {
1815 keyno++;
1816 /*
1817 * If ge search and we went off the end of the block, but it's
1818 * not the last block, we're in the wrong block.
1819 */
1820 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1821 if (dir == XFS_LOOKUP_GE &&
1822 keyno > xfs_btree_get_numrecs(block) &&
1823 !xfs_btree_ptr_is_null(cur, &ptr)) {
1824 int i;
1825
1826 cur->bc_ptrs[0] = keyno;
1827 error = xfs_btree_increment(cur, 0, &i);
1828 if (error)
1829 goto error0;
1830 XFS_WANT_CORRUPTED_RETURN(i == 1);
1831 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1832 *stat = 1;
1833 return 0;
1834 }
1835 } else if (dir == XFS_LOOKUP_LE && diff > 0)
1836 keyno--;
1837 cur->bc_ptrs[0] = keyno;
1838
1839 /* Return if we succeeded or not. */
1840 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1841 *stat = 0;
1842 else if (dir != XFS_LOOKUP_EQ || diff == 0)
1843 *stat = 1;
1844 else
1845 *stat = 0;
1846 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1847 return 0;
1848
1849error0:
1850 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1851 return error;
1852}
1853
1854/*
1855 * Update keys at all levels from here to the root along the cursor's path.
1856 */
1857STATIC int
1858xfs_btree_updkey(
1859 struct xfs_btree_cur *cur,
1860 union xfs_btree_key *keyp,
1861 int level)
1862{
1863 struct xfs_btree_block *block;
1864 struct xfs_buf *bp;
1865 union xfs_btree_key *kp;
1866 int ptr;
1867
1868 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1869 XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1870
1871 ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1872
1873 /*
1874 * Go up the tree from this level toward the root.
1875 * At each level, update the key value to the value input.
1876 * Stop when we reach a level where the cursor isn't pointing
1877 * at the first entry in the block.
1878 */
1879 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1880#ifdef DEBUG
1881 int error;
1882#endif
1883 block = xfs_btree_get_block(cur, level, &bp);
1884#ifdef DEBUG
1885 error = xfs_btree_check_block(cur, block, level, bp);
1886 if (error) {
1887 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1888 return error;
1889 }
1890#endif
1891 ptr = cur->bc_ptrs[level];
1892 kp = xfs_btree_key_addr(cur, ptr, block);
1893 xfs_btree_copy_keys(cur, kp, keyp, 1);
1894 xfs_btree_log_keys(cur, bp, ptr, ptr);
1895 }
1896
1897 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1898 return 0;
1899}
1900
1901/*
1902 * Update the record referred to by cur to the value in the
1903 * given record. This either works (return 0) or gets an
1904 * EFSCORRUPTED error.
1905 */
1906int
1907xfs_btree_update(
1908 struct xfs_btree_cur *cur,
1909 union xfs_btree_rec *rec)
1910{
1911 struct xfs_btree_block *block;
1912 struct xfs_buf *bp;
1913 int error;
1914 int ptr;
1915 union xfs_btree_rec *rp;
1916
1917 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1918 XFS_BTREE_TRACE_ARGR(cur, rec);
1919
1920 /* Pick up the current block. */
1921 block = xfs_btree_get_block(cur, 0, &bp);
1922
1923#ifdef DEBUG
1924 error = xfs_btree_check_block(cur, block, 0, bp);
1925 if (error)
1926 goto error0;
1927#endif
1928 /* Get the address of the rec to be updated. */
1929 ptr = cur->bc_ptrs[0];
1930 rp = xfs_btree_rec_addr(cur, ptr, block);
1931
1932 /* Fill in the new contents and log them. */
1933 xfs_btree_copy_recs(cur, rp, rec, 1);
1934 xfs_btree_log_recs(cur, bp, ptr, ptr);
1935
1936 /*
1937 * If we are tracking the last record in the tree and
1938 * we are at the far right edge of the tree, update it.
1939 */
1940 if (xfs_btree_is_lastrec(cur, block, 0)) {
1941 cur->bc_ops->update_lastrec(cur, block, rec,
1942 ptr, LASTREC_UPDATE);
1943 }
1944
1945 /* Updating first rec in leaf. Pass new key value up to our parent. */
1946 if (ptr == 1) {
1947 union xfs_btree_key key;
1948
1949 cur->bc_ops->init_key_from_rec(&key, rec);
1950 error = xfs_btree_updkey(cur, &key, 1);
1951 if (error)
1952 goto error0;
1953 }
1954
1955 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1956 return 0;
1957
1958error0:
1959 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1960 return error;
1961}
1962
1963/*
1964 * Move 1 record left from cur/level if possible.
1965 * Update cur to reflect the new path.
1966 */
1967STATIC int /* error */
1968xfs_btree_lshift(
1969 struct xfs_btree_cur *cur,
1970 int level,
1971 int *stat) /* success/failure */
1972{
1973 union xfs_btree_key key; /* btree key */
1974 struct xfs_buf *lbp; /* left buffer pointer */
1975 struct xfs_btree_block *left; /* left btree block */
1976 int lrecs; /* left record count */
1977 struct xfs_buf *rbp; /* right buffer pointer */
1978 struct xfs_btree_block *right; /* right btree block */
1979 int rrecs; /* right record count */
1980 union xfs_btree_ptr lptr; /* left btree pointer */
1981 union xfs_btree_key *rkp = NULL; /* right btree key */
1982 union xfs_btree_ptr *rpp = NULL; /* right address pointer */
1983 union xfs_btree_rec *rrp = NULL; /* right record pointer */
1984 int error; /* error return value */
1985
1986 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1987 XFS_BTREE_TRACE_ARGI(cur, level);
1988
1989 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1990 level == cur->bc_nlevels - 1)
1991 goto out0;
1992
1993 /* Set up variables for this block as "right". */
1994 right = xfs_btree_get_block(cur, level, &rbp);
1995
1996#ifdef DEBUG
1997 error = xfs_btree_check_block(cur, right, level, rbp);
1998 if (error)
1999 goto error0;
2000#endif
2001
2002 /* If we've got no left sibling then we can't shift an entry left. */
2003 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2004 if (xfs_btree_ptr_is_null(cur, &lptr))
2005 goto out0;
2006
2007 /*
2008 * If the cursor entry is the one that would be moved, don't
2009 * do it... it's too complicated.
2010 */
2011 if (cur->bc_ptrs[level] <= 1)
2012 goto out0;
2013
2014 /* Set up the left neighbor as "left". */
2015 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2016 if (error)
2017 goto error0;
2018
2019 /* If it's full, it can't take another entry. */
2020 lrecs = xfs_btree_get_numrecs(left);
2021 if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2022 goto out0;
2023
2024 rrecs = xfs_btree_get_numrecs(right);
2025
2026 /*
2027 * We add one entry to the left side and remove one for the right side.
2028 * Account for it here, the changes will be updated on disk and logged
2029 * later.
2030 */
2031 lrecs++;
2032 rrecs--;
2033
2034 XFS_BTREE_STATS_INC(cur, lshift);
2035 XFS_BTREE_STATS_ADD(cur, moves, 1);
2036
2037 /*
2038 * If non-leaf, copy a key and a ptr to the left block.
2039 * Log the changes to the left block.
2040 */
2041 if (level > 0) {
2042 /* It's a non-leaf. Move keys and pointers. */
2043 union xfs_btree_key *lkp; /* left btree key */
2044 union xfs_btree_ptr *lpp; /* left address pointer */
2045
2046 lkp = xfs_btree_key_addr(cur, lrecs, left);
2047 rkp = xfs_btree_key_addr(cur, 1, right);
2048
2049 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2050 rpp = xfs_btree_ptr_addr(cur, 1, right);
2051#ifdef DEBUG
2052 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2053 if (error)
2054 goto error0;
2055#endif
2056 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2057 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2058
2059 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2060 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2061
2062 ASSERT(cur->bc_ops->keys_inorder(cur,
2063 xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2064 } else {
2065 /* It's a leaf. Move records. */
2066 union xfs_btree_rec *lrp; /* left record pointer */
2067
2068 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2069 rrp = xfs_btree_rec_addr(cur, 1, right);
2070
2071 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2072 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2073
2074 ASSERT(cur->bc_ops->recs_inorder(cur,
2075 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2076 }
2077
2078 xfs_btree_set_numrecs(left, lrecs);
2079 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2080
2081 xfs_btree_set_numrecs(right, rrecs);
2082 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2083
2084 /*
2085 * Slide the contents of right down one entry.
2086 */
2087 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2088 if (level > 0) {
2089 /* It's a nonleaf. operate on keys and ptrs */
2090#ifdef DEBUG
2091 int i; /* loop index */
2092
2093 for (i = 0; i < rrecs; i++) {
2094 error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2095 if (error)
2096 goto error0;
2097 }
2098#endif
2099 xfs_btree_shift_keys(cur,
2100 xfs_btree_key_addr(cur, 2, right),
2101 -1, rrecs);
2102 xfs_btree_shift_ptrs(cur,
2103 xfs_btree_ptr_addr(cur, 2, right),
2104 -1, rrecs);
2105
2106 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2107 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2108 } else {
2109 /* It's a leaf. operate on records */
2110 xfs_btree_shift_recs(cur,
2111 xfs_btree_rec_addr(cur, 2, right),
2112 -1, rrecs);
2113 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2114
2115 /*
2116 * If it's the first record in the block, we'll need a key
2117 * structure to pass up to the next level (updkey).
2118 */
2119 cur->bc_ops->init_key_from_rec(&key,
2120 xfs_btree_rec_addr(cur, 1, right));
2121 rkp = &key;
2122 }
2123
2124 /* Update the parent key values of right. */
2125 error = xfs_btree_updkey(cur, rkp, level + 1);
2126 if (error)
2127 goto error0;
2128
2129 /* Slide the cursor value left one. */
2130 cur->bc_ptrs[level]--;
2131
2132 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2133 *stat = 1;
2134 return 0;
2135
2136out0:
2137 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2138 *stat = 0;
2139 return 0;
2140
2141error0:
2142 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2143 return error;
2144}
2145
2146/*
2147 * Move 1 record right from cur/level if possible.
2148 * Update cur to reflect the new path.
2149 */
2150STATIC int /* error */
2151xfs_btree_rshift(
2152 struct xfs_btree_cur *cur,
2153 int level,
2154 int *stat) /* success/failure */
2155{
2156 union xfs_btree_key key; /* btree key */
2157 struct xfs_buf *lbp; /* left buffer pointer */
2158 struct xfs_btree_block *left; /* left btree block */
2159 struct xfs_buf *rbp; /* right buffer pointer */
2160 struct xfs_btree_block *right; /* right btree block */
2161 struct xfs_btree_cur *tcur; /* temporary btree cursor */
2162 union xfs_btree_ptr rptr; /* right block pointer */
2163 union xfs_btree_key *rkp; /* right btree key */
2164 int rrecs; /* right record count */
2165 int lrecs; /* left record count */
2166 int error; /* error return value */
2167 int i; /* loop counter */
2168
2169 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2170 XFS_BTREE_TRACE_ARGI(cur, level);
2171
2172 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2173 (level == cur->bc_nlevels - 1))
2174 goto out0;
2175
2176 /* Set up variables for this block as "left". */
2177 left = xfs_btree_get_block(cur, level, &lbp);
2178
2179#ifdef DEBUG
2180 error = xfs_btree_check_block(cur, left, level, lbp);
2181 if (error)
2182 goto error0;
2183#endif
2184
2185 /* If we've got no right sibling then we can't shift an entry right. */
2186 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2187 if (xfs_btree_ptr_is_null(cur, &rptr))
2188 goto out0;
2189
2190 /*
2191 * If the cursor entry is the one that would be moved, don't
2192 * do it... it's too complicated.
2193 */
2194 lrecs = xfs_btree_get_numrecs(left);
2195 if (cur->bc_ptrs[level] >= lrecs)
2196 goto out0;
2197
2198 /* Set up the right neighbor as "right". */
2199 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2200 if (error)
2201 goto error0;
2202
2203 /* If it's full, it can't take another entry. */
2204 rrecs = xfs_btree_get_numrecs(right);
2205 if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2206 goto out0;
2207
2208 XFS_BTREE_STATS_INC(cur, rshift);
2209 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2210
2211 /*
2212 * Make a hole at the start of the right neighbor block, then
2213 * copy the last left block entry to the hole.
2214 */
2215 if (level > 0) {
2216 /* It's a nonleaf. make a hole in the keys and ptrs */
2217 union xfs_btree_key *lkp;
2218 union xfs_btree_ptr *lpp;
2219 union xfs_btree_ptr *rpp;
2220
2221 lkp = xfs_btree_key_addr(cur, lrecs, left);
2222 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2223 rkp = xfs_btree_key_addr(cur, 1, right);
2224 rpp = xfs_btree_ptr_addr(cur, 1, right);
2225
2226#ifdef DEBUG
2227 for (i = rrecs - 1; i >= 0; i--) {
2228 error = xfs_btree_check_ptr(cur, rpp, i, level);
2229 if (error)
2230 goto error0;
2231 }
2232#endif
2233
2234 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2235 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2236
2237#ifdef DEBUG
2238 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2239 if (error)
2240 goto error0;
2241#endif
2242
2243 /* Now put the new data in, and log it. */
2244 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2245 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2246
2247 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2248 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2249
2250 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2251 xfs_btree_key_addr(cur, 2, right)));
2252 } else {
2253 /* It's a leaf. make a hole in the records */
2254 union xfs_btree_rec *lrp;
2255 union xfs_btree_rec *rrp;
2256
2257 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2258 rrp = xfs_btree_rec_addr(cur, 1, right);
2259
2260 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2261
2262 /* Now put the new data in, and log it. */
2263 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2264 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2265
2266 cur->bc_ops->init_key_from_rec(&key, rrp);
2267 rkp = &key;
2268
2269 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2270 xfs_btree_rec_addr(cur, 2, right)));
2271 }
2272
2273 /*
2274 * Decrement and log left's numrecs, bump and log right's numrecs.
2275 */
2276 xfs_btree_set_numrecs(left, --lrecs);
2277 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2278
2279 xfs_btree_set_numrecs(right, ++rrecs);
2280 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2281
2282 /*
2283 * Using a temporary cursor, update the parent key values of the
2284 * block on the right.
2285 */
2286 error = xfs_btree_dup_cursor(cur, &tcur);
2287 if (error)
2288 goto error0;
2289 i = xfs_btree_lastrec(tcur, level);
2290 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2291
2292 error = xfs_btree_increment(tcur, level, &i);
2293 if (error)
2294 goto error1;
2295
2296 error = xfs_btree_updkey(tcur, rkp, level + 1);
2297 if (error)
2298 goto error1;
2299
2300 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2301
2302 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2303 *stat = 1;
2304 return 0;
2305
2306out0:
2307 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2308 *stat = 0;
2309 return 0;
2310
2311error0:
2312 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2313 return error;
2314
2315error1:
2316 XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2317 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2318 return error;
2319}
2320
2321/*
2322 * Split cur/level block in half.
2323 * Return new block number and the key to its first
2324 * record (to be inserted into parent).
2325 */
2326STATIC int /* error */
2327__xfs_btree_split(
2328 struct xfs_btree_cur *cur,
2329 int level,
2330 union xfs_btree_ptr *ptrp,
2331 union xfs_btree_key *key,
2332 struct xfs_btree_cur **curp,
2333 int *stat) /* success/failure */
2334{
2335 union xfs_btree_ptr lptr; /* left sibling block ptr */
2336 struct xfs_buf *lbp; /* left buffer pointer */
2337 struct xfs_btree_block *left; /* left btree block */
2338 union xfs_btree_ptr rptr; /* right sibling block ptr */
2339 struct xfs_buf *rbp; /* right buffer pointer */
2340 struct xfs_btree_block *right; /* right btree block */
2341 union xfs_btree_ptr rrptr; /* right-right sibling ptr */
2342 struct xfs_buf *rrbp; /* right-right buffer pointer */
2343 struct xfs_btree_block *rrblock; /* right-right btree block */
2344 int lrecs;
2345 int rrecs;
2346 int src_index;
2347 int error; /* error return value */
2348#ifdef DEBUG
2349 int i;
2350#endif
2351
2352 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2353 XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2354
2355 XFS_BTREE_STATS_INC(cur, split);
2356
2357 /* Set up left block (current one). */
2358 left = xfs_btree_get_block(cur, level, &lbp);
2359
2360#ifdef DEBUG
2361 error = xfs_btree_check_block(cur, left, level, lbp);
2362 if (error)
2363 goto error0;
2364#endif
2365
2366 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2367
2368 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2369 error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2370 if (error)
2371 goto error0;
2372 if (*stat == 0)
2373 goto out0;
2374 XFS_BTREE_STATS_INC(cur, alloc);
2375
2376 /* Set up the new block as "right". */
2377 error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2378 if (error)
2379 goto error0;
2380
2381 /* Fill in the btree header for the new right block. */
2382 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2383
2384 /*
2385 * Split the entries between the old and the new block evenly.
2386 * Make sure that if there's an odd number of entries now, that
2387 * each new block will have the same number of entries.
2388 */
2389 lrecs = xfs_btree_get_numrecs(left);
2390 rrecs = lrecs / 2;
2391 if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2392 rrecs++;
2393 src_index = (lrecs - rrecs + 1);
2394
2395 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2396
2397 /*
2398 * Copy btree block entries from the left block over to the
2399 * new block, the right. Update the right block and log the
2400 * changes.
2401 */
2402 if (level > 0) {
2403 /* It's a non-leaf. Move keys and pointers. */
2404 union xfs_btree_key *lkp; /* left btree key */
2405 union xfs_btree_ptr *lpp; /* left address pointer */
2406 union xfs_btree_key *rkp; /* right btree key */
2407 union xfs_btree_ptr *rpp; /* right address pointer */
2408
2409 lkp = xfs_btree_key_addr(cur, src_index, left);
2410 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2411 rkp = xfs_btree_key_addr(cur, 1, right);
2412 rpp = xfs_btree_ptr_addr(cur, 1, right);
2413
2414#ifdef DEBUG
2415 for (i = src_index; i < rrecs; i++) {
2416 error = xfs_btree_check_ptr(cur, lpp, i, level);
2417 if (error)
2418 goto error0;
2419 }
2420#endif
2421
2422 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2423 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2424
2425 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2426 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2427
2428 /* Grab the keys to the entries moved to the right block */
2429 xfs_btree_copy_keys(cur, key, rkp, 1);
2430 } else {
2431 /* It's a leaf. Move records. */
2432 union xfs_btree_rec *lrp; /* left record pointer */
2433 union xfs_btree_rec *rrp; /* right record pointer */
2434
2435 lrp = xfs_btree_rec_addr(cur, src_index, left);
2436 rrp = xfs_btree_rec_addr(cur, 1, right);
2437
2438 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2439 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2440
2441 cur->bc_ops->init_key_from_rec(key,
2442 xfs_btree_rec_addr(cur, 1, right));
2443 }
2444
2445
2446 /*
2447 * Find the left block number by looking in the buffer.
2448 * Adjust numrecs, sibling pointers.
2449 */
2450 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2451 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2452 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2453 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2454
2455 lrecs -= rrecs;
2456 xfs_btree_set_numrecs(left, lrecs);
2457 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2458
2459 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2460 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2461
2462 /*
2463 * If there's a block to the new block's right, make that block
2464 * point back to right instead of to left.
2465 */
2466 if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2467 error = xfs_btree_read_buf_block(cur, &rrptr,
2468 0, &rrblock, &rrbp);
2469 if (error)
2470 goto error0;
2471 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2472 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2473 }
2474 /*
2475 * If the cursor is really in the right block, move it there.
2476 * If it's just pointing past the last entry in left, then we'll
2477 * insert there, so don't change anything in that case.
2478 */
2479 if (cur->bc_ptrs[level] > lrecs + 1) {
2480 xfs_btree_setbuf(cur, level, rbp);
2481 cur->bc_ptrs[level] -= lrecs;
2482 }
2483 /*
2484 * If there are more levels, we'll need another cursor which refers
2485 * the right block, no matter where this cursor was.
2486 */
2487 if (level + 1 < cur->bc_nlevels) {
2488 error = xfs_btree_dup_cursor(cur, curp);
2489 if (error)
2490 goto error0;
2491 (*curp)->bc_ptrs[level + 1]++;
2492 }
2493 *ptrp = rptr;
2494 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2495 *stat = 1;
2496 return 0;
2497out0:
2498 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2499 *stat = 0;
2500 return 0;
2501
2502error0:
2503 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2504 return error;
2505}
2506
2507struct xfs_btree_split_args {
2508 struct xfs_btree_cur *cur;
2509 int level;
2510 union xfs_btree_ptr *ptrp;
2511 union xfs_btree_key *key;
2512 struct xfs_btree_cur **curp;
2513 int *stat; /* success/failure */
2514 int result;
2515 bool kswapd; /* allocation in kswapd context */
2516 struct completion *done;
2517 struct work_struct work;
2518};
2519
2520/*
2521 * Stack switching interfaces for allocation
2522 */
2523static void
2524xfs_btree_split_worker(
2525 struct work_struct *work)
2526{
2527 struct xfs_btree_split_args *args = container_of(work,
2528 struct xfs_btree_split_args, work);
2529 unsigned long pflags;
2530 unsigned long new_pflags = PF_FSTRANS;
2531
2532 /*
2533 * we are in a transaction context here, but may also be doing work
2534 * in kswapd context, and hence we may need to inherit that state
2535 * temporarily to ensure that we don't block waiting for memory reclaim
2536 * in any way.
2537 */
2538 if (args->kswapd)
2539 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2540
2541 current_set_flags_nested(&pflags, new_pflags);
2542
2543 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2544 args->key, args->curp, args->stat);
2545 complete(args->done);
2546
2547 current_restore_flags_nested(&pflags, new_pflags);
2548}
2549
2550/*
2551 * BMBT split requests often come in with little stack to work on. Push
2552 * them off to a worker thread so there is lots of stack to use. For the other
2553 * btree types, just call directly to avoid the context switch overhead here.
2554 */
2555STATIC int /* error */
2556xfs_btree_split(
2557 struct xfs_btree_cur *cur,
2558 int level,
2559 union xfs_btree_ptr *ptrp,
2560 union xfs_btree_key *key,
2561 struct xfs_btree_cur **curp,
2562 int *stat) /* success/failure */
2563{
2564 struct xfs_btree_split_args args;
2565 DECLARE_COMPLETION_ONSTACK(done);
2566
2567 if (cur->bc_btnum != XFS_BTNUM_BMAP)
2568 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2569
2570 args.cur = cur;
2571 args.level = level;
2572 args.ptrp = ptrp;
2573 args.key = key;
2574 args.curp = curp;
2575 args.stat = stat;
2576 args.done = &done;
2577 args.kswapd = current_is_kswapd();
2578 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2579 queue_work(xfs_alloc_wq, &args.work);
2580 wait_for_completion(&done);
2581 destroy_work_on_stack(&args.work);
2582 return args.result;
2583}
2584
2585
2586/*
2587 * Copy the old inode root contents into a real block and make the
2588 * broot point to it.
2589 */
2590int /* error */
2591xfs_btree_new_iroot(
2592 struct xfs_btree_cur *cur, /* btree cursor */
2593 int *logflags, /* logging flags for inode */
2594 int *stat) /* return status - 0 fail */
2595{
2596 struct xfs_buf *cbp; /* buffer for cblock */
2597 struct xfs_btree_block *block; /* btree block */
2598 struct xfs_btree_block *cblock; /* child btree block */
2599 union xfs_btree_key *ckp; /* child key pointer */
2600 union xfs_btree_ptr *cpp; /* child ptr pointer */
2601 union xfs_btree_key *kp; /* pointer to btree key */
2602 union xfs_btree_ptr *pp; /* pointer to block addr */
2603 union xfs_btree_ptr nptr; /* new block addr */
2604 int level; /* btree level */
2605 int error; /* error return code */
2606#ifdef DEBUG
2607 int i; /* loop counter */
2608#endif
2609
2610 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2611 XFS_BTREE_STATS_INC(cur, newroot);
2612
2613 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2614
2615 level = cur->bc_nlevels - 1;
2616
2617 block = xfs_btree_get_iroot(cur);
2618 pp = xfs_btree_ptr_addr(cur, 1, block);
2619
2620 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2621 error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2622 if (error)
2623 goto error0;
2624 if (*stat == 0) {
2625 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2626 return 0;
2627 }
2628 XFS_BTREE_STATS_INC(cur, alloc);
2629
2630 /* Copy the root into a real block. */
2631 error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2632 if (error)
2633 goto error0;
2634
2635 /*
2636 * we can't just memcpy() the root in for CRC enabled btree blocks.
2637 * In that case have to also ensure the blkno remains correct
2638 */
2639 memcpy(cblock, block, xfs_btree_block_len(cur));
2640 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2641 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2642 cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2643 else
2644 cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2645 }
2646
2647 be16_add_cpu(&block->bb_level, 1);
2648 xfs_btree_set_numrecs(block, 1);
2649 cur->bc_nlevels++;
2650 cur->bc_ptrs[level + 1] = 1;
2651
2652 kp = xfs_btree_key_addr(cur, 1, block);
2653 ckp = xfs_btree_key_addr(cur, 1, cblock);
2654 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2655
2656 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2657#ifdef DEBUG
2658 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2659 error = xfs_btree_check_ptr(cur, pp, i, level);
2660 if (error)
2661 goto error0;
2662 }
2663#endif
2664 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2665
2666#ifdef DEBUG
2667 error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2668 if (error)
2669 goto error0;
2670#endif
2671 xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2672
2673 xfs_iroot_realloc(cur->bc_private.b.ip,
2674 1 - xfs_btree_get_numrecs(cblock),
2675 cur->bc_private.b.whichfork);
2676
2677 xfs_btree_setbuf(cur, level, cbp);
2678
2679 /*
2680 * Do all this logging at the end so that
2681 * the root is at the right level.
2682 */
2683 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2684 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2685 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2686
2687 *logflags |=
2688 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2689 *stat = 1;
2690 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2691 return 0;
2692error0:
2693 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2694 return error;
2695}
2696
2697/*
2698 * Allocate a new root block, fill it in.
2699 */
2700STATIC int /* error */
2701xfs_btree_new_root(
2702 struct xfs_btree_cur *cur, /* btree cursor */
2703 int *stat) /* success/failure */
2704{
2705 struct xfs_btree_block *block; /* one half of the old root block */
2706 struct xfs_buf *bp; /* buffer containing block */
2707 int error; /* error return value */
2708 struct xfs_buf *lbp; /* left buffer pointer */
2709 struct xfs_btree_block *left; /* left btree block */
2710 struct xfs_buf *nbp; /* new (root) buffer */
2711 struct xfs_btree_block *new; /* new (root) btree block */
2712 int nptr; /* new value for key index, 1 or 2 */
2713 struct xfs_buf *rbp; /* right buffer pointer */
2714 struct xfs_btree_block *right; /* right btree block */
2715 union xfs_btree_ptr rptr;
2716 union xfs_btree_ptr lptr;
2717
2718 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2719 XFS_BTREE_STATS_INC(cur, newroot);
2720
2721 /* initialise our start point from the cursor */
2722 cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2723
2724 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2725 error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
2726 if (error)
2727 goto error0;
2728 if (*stat == 0)
2729 goto out0;
2730 XFS_BTREE_STATS_INC(cur, alloc);
2731
2732 /* Set up the new block. */
2733 error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2734 if (error)
2735 goto error0;
2736
2737 /* Set the root in the holding structure increasing the level by 1. */
2738 cur->bc_ops->set_root(cur, &lptr, 1);
2739
2740 /*
2741 * At the previous root level there are now two blocks: the old root,
2742 * and the new block generated when it was split. We don't know which
2743 * one the cursor is pointing at, so we set up variables "left" and
2744 * "right" for each case.
2745 */
2746 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2747
2748#ifdef DEBUG
2749 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2750 if (error)
2751 goto error0;
2752#endif
2753
2754 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2755 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2756 /* Our block is left, pick up the right block. */
2757 lbp = bp;
2758 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2759 left = block;
2760 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2761 if (error)
2762 goto error0;
2763 bp = rbp;
2764 nptr = 1;
2765 } else {
2766 /* Our block is right, pick up the left block. */
2767 rbp = bp;
2768 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2769 right = block;
2770 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2771 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2772 if (error)
2773 goto error0;
2774 bp = lbp;
2775 nptr = 2;
2776 }
2777 /* Fill in the new block's btree header and log it. */
2778 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2779 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2780 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2781 !xfs_btree_ptr_is_null(cur, &rptr));
2782
2783 /* Fill in the key data in the new root. */
2784 if (xfs_btree_get_level(left) > 0) {
2785 xfs_btree_copy_keys(cur,
2786 xfs_btree_key_addr(cur, 1, new),
2787 xfs_btree_key_addr(cur, 1, left), 1);
2788 xfs_btree_copy_keys(cur,
2789 xfs_btree_key_addr(cur, 2, new),
2790 xfs_btree_key_addr(cur, 1, right), 1);
2791 } else {
2792 cur->bc_ops->init_key_from_rec(
2793 xfs_btree_key_addr(cur, 1, new),
2794 xfs_btree_rec_addr(cur, 1, left));
2795 cur->bc_ops->init_key_from_rec(
2796 xfs_btree_key_addr(cur, 2, new),
2797 xfs_btree_rec_addr(cur, 1, right));
2798 }
2799 xfs_btree_log_keys(cur, nbp, 1, 2);
2800
2801 /* Fill in the pointer data in the new root. */
2802 xfs_btree_copy_ptrs(cur,
2803 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2804 xfs_btree_copy_ptrs(cur,
2805 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2806 xfs_btree_log_ptrs(cur, nbp, 1, 2);
2807
2808 /* Fix up the cursor. */
2809 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2810 cur->bc_ptrs[cur->bc_nlevels] = nptr;
2811 cur->bc_nlevels++;
2812 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2813 *stat = 1;
2814 return 0;
2815error0:
2816 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2817 return error;
2818out0:
2819 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2820 *stat = 0;
2821 return 0;
2822}
2823
2824STATIC int
2825xfs_btree_make_block_unfull(
2826 struct xfs_btree_cur *cur, /* btree cursor */
2827 int level, /* btree level */
2828 int numrecs,/* # of recs in block */
2829 int *oindex,/* old tree index */
2830 int *index, /* new tree index */
2831 union xfs_btree_ptr *nptr, /* new btree ptr */
2832 struct xfs_btree_cur **ncur, /* new btree cursor */
2833 union xfs_btree_rec *nrec, /* new record */
2834 int *stat)
2835{
2836 union xfs_btree_key key; /* new btree key value */
2837 int error = 0;
2838
2839 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2840 level == cur->bc_nlevels - 1) {
2841 struct xfs_inode *ip = cur->bc_private.b.ip;
2842
2843 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2844 /* A root block that can be made bigger. */
2845 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2846 } else {
2847 /* A root block that needs replacing */
2848 int logflags = 0;
2849
2850 error = xfs_btree_new_iroot(cur, &logflags, stat);
2851 if (error || *stat == 0)
2852 return error;
2853
2854 xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2855 }
2856
2857 return 0;
2858 }
2859
2860 /* First, try shifting an entry to the right neighbor. */
2861 error = xfs_btree_rshift(cur, level, stat);
2862 if (error || *stat)
2863 return error;
2864
2865 /* Next, try shifting an entry to the left neighbor. */
2866 error = xfs_btree_lshift(cur, level, stat);
2867 if (error)
2868 return error;
2869
2870 if (*stat) {
2871 *oindex = *index = cur->bc_ptrs[level];
2872 return 0;
2873 }
2874
2875 /*
2876 * Next, try splitting the current block in half.
2877 *
2878 * If this works we have to re-set our variables because we
2879 * could be in a different block now.
2880 */
2881 error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2882 if (error || *stat == 0)
2883 return error;
2884
2885
2886 *index = cur->bc_ptrs[level];
2887 cur->bc_ops->init_rec_from_key(&key, nrec);
2888 return 0;
2889}
2890
2891/*
2892 * Insert one record/level. Return information to the caller
2893 * allowing the next level up to proceed if necessary.
2894 */
2895STATIC int
2896xfs_btree_insrec(
2897 struct xfs_btree_cur *cur, /* btree cursor */
2898 int level, /* level to insert record at */
2899 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
2900 union xfs_btree_rec *recp, /* i/o: record data inserted */
2901 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
2902 int *stat) /* success/failure */
2903{
2904 struct xfs_btree_block *block; /* btree block */
2905 struct xfs_buf *bp; /* buffer for block */
2906 union xfs_btree_key key; /* btree key */
2907 union xfs_btree_ptr nptr; /* new block ptr */
2908 struct xfs_btree_cur *ncur; /* new btree cursor */
2909 union xfs_btree_rec nrec; /* new record count */
2910 int optr; /* old key/record index */
2911 int ptr; /* key/record index */
2912 int numrecs;/* number of records */
2913 int error; /* error return value */
2914#ifdef DEBUG
2915 int i;
2916#endif
2917
2918 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2919 XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2920
2921 ncur = NULL;
2922
2923 /*
2924 * If we have an external root pointer, and we've made it to the
2925 * root level, allocate a new root block and we're done.
2926 */
2927 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2928 (level >= cur->bc_nlevels)) {
2929 error = xfs_btree_new_root(cur, stat);
2930 xfs_btree_set_ptr_null(cur, ptrp);
2931
2932 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2933 return error;
2934 }
2935
2936 /* If we're off the left edge, return failure. */
2937 ptr = cur->bc_ptrs[level];
2938 if (ptr == 0) {
2939 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2940 *stat = 0;
2941 return 0;
2942 }
2943
2944 /* Make a key out of the record data to be inserted, and save it. */
2945 cur->bc_ops->init_key_from_rec(&key, recp);
2946
2947 optr = ptr;
2948
2949 XFS_BTREE_STATS_INC(cur, insrec);
2950
2951 /* Get pointers to the btree buffer and block. */
2952 block = xfs_btree_get_block(cur, level, &bp);
2953 numrecs = xfs_btree_get_numrecs(block);
2954
2955#ifdef DEBUG
2956 error = xfs_btree_check_block(cur, block, level, bp);
2957 if (error)
2958 goto error0;
2959
2960 /* Check that the new entry is being inserted in the right place. */
2961 if (ptr <= numrecs) {
2962 if (level == 0) {
2963 ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2964 xfs_btree_rec_addr(cur, ptr, block)));
2965 } else {
2966 ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2967 xfs_btree_key_addr(cur, ptr, block)));
2968 }
2969 }
2970#endif
2971
2972 /*
2973 * If the block is full, we can't insert the new entry until we
2974 * make the block un-full.
2975 */
2976 xfs_btree_set_ptr_null(cur, &nptr);
2977 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2978 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2979 &optr, &ptr, &nptr, &ncur, &nrec, stat);
2980 if (error || *stat == 0)
2981 goto error0;
2982 }
2983
2984 /*
2985 * The current block may have changed if the block was
2986 * previously full and we have just made space in it.
2987 */
2988 block = xfs_btree_get_block(cur, level, &bp);
2989 numrecs = xfs_btree_get_numrecs(block);
2990
2991#ifdef DEBUG
2992 error = xfs_btree_check_block(cur, block, level, bp);
2993 if (error)
2994 return error;
2995#endif
2996
2997 /*
2998 * At this point we know there's room for our new entry in the block
2999 * we're pointing at.
3000 */
3001 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3002
3003 if (level > 0) {
3004 /* It's a nonleaf. make a hole in the keys and ptrs */
3005 union xfs_btree_key *kp;
3006 union xfs_btree_ptr *pp;
3007
3008 kp = xfs_btree_key_addr(cur, ptr, block);
3009 pp = xfs_btree_ptr_addr(cur, ptr, block);
3010
3011#ifdef DEBUG
3012 for (i = numrecs - ptr; i >= 0; i--) {
3013 error = xfs_btree_check_ptr(cur, pp, i, level);
3014 if (error)
3015 return error;
3016 }
3017#endif
3018
3019 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3020 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3021
3022#ifdef DEBUG
3023 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3024 if (error)
3025 goto error0;
3026#endif
3027
3028 /* Now put the new data in, bump numrecs and log it. */
3029 xfs_btree_copy_keys(cur, kp, &key, 1);
3030 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3031 numrecs++;
3032 xfs_btree_set_numrecs(block, numrecs);
3033 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3034 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3035#ifdef DEBUG
3036 if (ptr < numrecs) {
3037 ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3038 xfs_btree_key_addr(cur, ptr + 1, block)));
3039 }
3040#endif
3041 } else {
3042 /* It's a leaf. make a hole in the records */
3043 union xfs_btree_rec *rp;
3044
3045 rp = xfs_btree_rec_addr(cur, ptr, block);
3046
3047 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3048
3049 /* Now put the new data in, bump numrecs and log it. */
3050 xfs_btree_copy_recs(cur, rp, recp, 1);
3051 xfs_btree_set_numrecs(block, ++numrecs);
3052 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3053#ifdef DEBUG
3054 if (ptr < numrecs) {
3055 ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3056 xfs_btree_rec_addr(cur, ptr + 1, block)));
3057 }
3058#endif
3059 }
3060
3061 /* Log the new number of records in the btree header. */
3062 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3063
3064 /* If we inserted at the start of a block, update the parents' keys. */
3065 if (optr == 1) {
3066 error = xfs_btree_updkey(cur, &key, level + 1);
3067 if (error)
3068 goto error0;
3069 }
3070
3071 /*
3072 * If we are tracking the last record in the tree and
3073 * we are at the far right edge of the tree, update it.
3074 */
3075 if (xfs_btree_is_lastrec(cur, block, level)) {
3076 cur->bc_ops->update_lastrec(cur, block, recp,
3077 ptr, LASTREC_INSREC);
3078 }
3079
3080 /*
3081 * Return the new block number, if any.
3082 * If there is one, give back a record value and a cursor too.
3083 */
3084 *ptrp = nptr;
3085 if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3086 *recp = nrec;
3087 *curp = ncur;
3088 }
3089
3090 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3091 *stat = 1;
3092 return 0;
3093
3094error0:
3095 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3096 return error;
3097}
3098
3099/*
3100 * Insert the record at the point referenced by cur.
3101 *
3102 * A multi-level split of the tree on insert will invalidate the original
3103 * cursor. All callers of this function should assume that the cursor is
3104 * no longer valid and revalidate it.
3105 */
3106int
3107xfs_btree_insert(
3108 struct xfs_btree_cur *cur,
3109 int *stat)
3110{
3111 int error; /* error return value */
3112 int i; /* result value, 0 for failure */
3113 int level; /* current level number in btree */
3114 union xfs_btree_ptr nptr; /* new block number (split result) */
3115 struct xfs_btree_cur *ncur; /* new cursor (split result) */
3116 struct xfs_btree_cur *pcur; /* previous level's cursor */
3117 union xfs_btree_rec rec; /* record to insert */
3118
3119 level = 0;
3120 ncur = NULL;
3121 pcur = cur;
3122
3123 xfs_btree_set_ptr_null(cur, &nptr);
3124 cur->bc_ops->init_rec_from_cur(cur, &rec);
3125
3126 /*
3127 * Loop going up the tree, starting at the leaf level.
3128 * Stop when we don't get a split block, that must mean that
3129 * the insert is finished with this level.
3130 */
3131 do {
3132 /*
3133 * Insert nrec/nptr into this level of the tree.
3134 * Note if we fail, nptr will be null.
3135 */
3136 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3137 if (error) {
3138 if (pcur != cur)
3139 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3140 goto error0;
3141 }
3142
3143 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3144 level++;
3145
3146 /*
3147 * See if the cursor we just used is trash.
3148 * Can't trash the caller's cursor, but otherwise we should
3149 * if ncur is a new cursor or we're about to be done.
3150 */
3151 if (pcur != cur &&
3152 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3153 /* Save the state from the cursor before we trash it */
3154 if (cur->bc_ops->update_cursor)
3155 cur->bc_ops->update_cursor(pcur, cur);
3156 cur->bc_nlevels = pcur->bc_nlevels;
3157 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3158 }
3159 /* If we got a new cursor, switch to it. */
3160 if (ncur) {
3161 pcur = ncur;
3162 ncur = NULL;
3163 }
3164 } while (!xfs_btree_ptr_is_null(cur, &nptr));
3165
3166 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3167 *stat = i;
3168 return 0;
3169error0:
3170 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3171 return error;
3172}
3173
3174/*
3175 * Try to merge a non-leaf block back into the inode root.
3176 *
3177 * Note: the killroot names comes from the fact that we're effectively
3178 * killing the old root block. But because we can't just delete the
3179 * inode we have to copy the single block it was pointing to into the
3180 * inode.
3181 */
3182STATIC int
3183xfs_btree_kill_iroot(
3184 struct xfs_btree_cur *cur)
3185{
3186 int whichfork = cur->bc_private.b.whichfork;
3187 struct xfs_inode *ip = cur->bc_private.b.ip;
3188 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
3189 struct xfs_btree_block *block;
3190 struct xfs_btree_block *cblock;
3191 union xfs_btree_key *kp;
3192 union xfs_btree_key *ckp;
3193 union xfs_btree_ptr *pp;
3194 union xfs_btree_ptr *cpp;
3195 struct xfs_buf *cbp;
3196 int level;
3197 int index;
3198 int numrecs;
3199#ifdef DEBUG
3200 union xfs_btree_ptr ptr;
3201 int i;
3202#endif
3203
3204 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3205
3206 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3207 ASSERT(cur->bc_nlevels > 1);
3208
3209 /*
3210 * Don't deal with the root block needs to be a leaf case.
3211 * We're just going to turn the thing back into extents anyway.
3212 */
3213 level = cur->bc_nlevels - 1;
3214 if (level == 1)
3215 goto out0;
3216
3217 /*
3218 * Give up if the root has multiple children.
3219 */
3220 block = xfs_btree_get_iroot(cur);
3221 if (xfs_btree_get_numrecs(block) != 1)
3222 goto out0;
3223
3224 cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3225 numrecs = xfs_btree_get_numrecs(cblock);
3226
3227 /*
3228 * Only do this if the next level will fit.
3229 * Then the data must be copied up to the inode,
3230 * instead of freeing the root you free the next level.
3231 */
3232 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3233 goto out0;
3234
3235 XFS_BTREE_STATS_INC(cur, killroot);
3236
3237#ifdef DEBUG
3238 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3239 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3240 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3241 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3242#endif
3243
3244 index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3245 if (index) {
3246 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3247 cur->bc_private.b.whichfork);
3248 block = ifp->if_broot;
3249 }
3250
3251 be16_add_cpu(&block->bb_numrecs, index);
3252 ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3253
3254 kp = xfs_btree_key_addr(cur, 1, block);
3255 ckp = xfs_btree_key_addr(cur, 1, cblock);
3256 xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3257
3258 pp = xfs_btree_ptr_addr(cur, 1, block);
3259 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3260#ifdef DEBUG
3261 for (i = 0; i < numrecs; i++) {
3262 int error;
3263
3264 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3265 if (error) {
3266 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3267 return error;
3268 }
3269 }
3270#endif
3271 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3272
3273 cur->bc_ops->free_block(cur, cbp);
3274 XFS_BTREE_STATS_INC(cur, free);
3275
3276 cur->bc_bufs[level - 1] = NULL;
3277 be16_add_cpu(&block->bb_level, -1);
3278 xfs_trans_log_inode(cur->bc_tp, ip,
3279 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3280 cur->bc_nlevels--;
3281out0:
3282 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3283 return 0;
3284}
3285
3286/*
3287 * Kill the current root node, and replace it with it's only child node.
3288 */
3289STATIC int
3290xfs_btree_kill_root(
3291 struct xfs_btree_cur *cur,
3292 struct xfs_buf *bp,
3293 int level,
3294 union xfs_btree_ptr *newroot)
3295{
3296 int error;
3297
3298 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3299 XFS_BTREE_STATS_INC(cur, killroot);
3300
3301 /*
3302 * Update the root pointer, decreasing the level by 1 and then
3303 * free the old root.
3304 */
3305 cur->bc_ops->set_root(cur, newroot, -1);
3306
3307 error = cur->bc_ops->free_block(cur, bp);
3308 if (error) {
3309 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3310 return error;
3311 }
3312
3313 XFS_BTREE_STATS_INC(cur, free);
3314
3315 cur->bc_bufs[level] = NULL;
3316 cur->bc_ra[level] = 0;
3317 cur->bc_nlevels--;
3318
3319 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3320 return 0;
3321}
3322
3323STATIC int
3324xfs_btree_dec_cursor(
3325 struct xfs_btree_cur *cur,
3326 int level,
3327 int *stat)
3328{
3329 int error;
3330 int i;
3331
3332 if (level > 0) {
3333 error = xfs_btree_decrement(cur, level, &i);
3334 if (error)
3335 return error;
3336 }
3337
3338 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3339 *stat = 1;
3340 return 0;
3341}
3342
3343/*
3344 * Single level of the btree record deletion routine.
3345 * Delete record pointed to by cur/level.
3346 * Remove the record from its block then rebalance the tree.
3347 * Return 0 for error, 1 for done, 2 to go on to the next level.
3348 */
3349STATIC int /* error */
3350xfs_btree_delrec(
3351 struct xfs_btree_cur *cur, /* btree cursor */
3352 int level, /* level removing record from */
3353 int *stat) /* fail/done/go-on */
3354{
3355 struct xfs_btree_block *block; /* btree block */
3356 union xfs_btree_ptr cptr; /* current block ptr */
3357 struct xfs_buf *bp; /* buffer for block */
3358 int error; /* error return value */
3359 int i; /* loop counter */
3360 union xfs_btree_key key; /* storage for keyp */
3361 union xfs_btree_key *keyp = &key; /* passed to the next level */
3362 union xfs_btree_ptr lptr; /* left sibling block ptr */
3363 struct xfs_buf *lbp; /* left buffer pointer */
3364 struct xfs_btree_block *left; /* left btree block */
3365 int lrecs = 0; /* left record count */
3366 int ptr; /* key/record index */
3367 union xfs_btree_ptr rptr; /* right sibling block ptr */
3368 struct xfs_buf *rbp; /* right buffer pointer */
3369 struct xfs_btree_block *right; /* right btree block */
3370 struct xfs_btree_block *rrblock; /* right-right btree block */
3371 struct xfs_buf *rrbp; /* right-right buffer pointer */
3372 int rrecs = 0; /* right record count */
3373 struct xfs_btree_cur *tcur; /* temporary btree cursor */
3374 int numrecs; /* temporary numrec count */
3375
3376 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3377 XFS_BTREE_TRACE_ARGI(cur, level);
3378
3379 tcur = NULL;
3380
3381 /* Get the index of the entry being deleted, check for nothing there. */
3382 ptr = cur->bc_ptrs[level];
3383 if (ptr == 0) {
3384 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3385 *stat = 0;
3386 return 0;
3387 }
3388
3389 /* Get the buffer & block containing the record or key/ptr. */
3390 block = xfs_btree_get_block(cur, level, &bp);
3391 numrecs = xfs_btree_get_numrecs(block);
3392
3393#ifdef DEBUG
3394 error = xfs_btree_check_block(cur, block, level, bp);
3395 if (error)
3396 goto error0;
3397#endif
3398
3399 /* Fail if we're off the end of the block. */
3400 if (ptr > numrecs) {
3401 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3402 *stat = 0;
3403 return 0;
3404 }
3405
3406 XFS_BTREE_STATS_INC(cur, delrec);
3407 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3408
3409 /* Excise the entries being deleted. */
3410 if (level > 0) {
3411 /* It's a nonleaf. operate on keys and ptrs */
3412 union xfs_btree_key *lkp;
3413 union xfs_btree_ptr *lpp;
3414
3415 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3416 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3417
3418#ifdef DEBUG
3419 for (i = 0; i < numrecs - ptr; i++) {
3420 error = xfs_btree_check_ptr(cur, lpp, i, level);
3421 if (error)
3422 goto error0;
3423 }
3424#endif
3425
3426 if (ptr < numrecs) {
3427 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3428 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3429 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3430 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3431 }
3432
3433 /*
3434 * If it's the first record in the block, we'll need to pass a
3435 * key up to the next level (updkey).
3436 */
3437 if (ptr == 1)
3438 keyp = xfs_btree_key_addr(cur, 1, block);
3439 } else {
3440 /* It's a leaf. operate on records */
3441 if (ptr < numrecs) {
3442 xfs_btree_shift_recs(cur,
3443 xfs_btree_rec_addr(cur, ptr + 1, block),
3444 -1, numrecs - ptr);
3445 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3446 }
3447
3448 /*
3449 * If it's the first record in the block, we'll need a key
3450 * structure to pass up to the next level (updkey).
3451 */
3452 if (ptr == 1) {
3453 cur->bc_ops->init_key_from_rec(&key,
3454 xfs_btree_rec_addr(cur, 1, block));
3455 keyp = &key;
3456 }
3457 }
3458
3459 /*
3460 * Decrement and log the number of entries in the block.
3461 */
3462 xfs_btree_set_numrecs(block, --numrecs);
3463 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3464
3465 /*
3466 * If we are tracking the last record in the tree and
3467 * we are at the far right edge of the tree, update it.
3468 */
3469 if (xfs_btree_is_lastrec(cur, block, level)) {
3470 cur->bc_ops->update_lastrec(cur, block, NULL,
3471 ptr, LASTREC_DELREC);
3472 }
3473
3474 /*
3475 * We're at the root level. First, shrink the root block in-memory.
3476 * Try to get rid of the next level down. If we can't then there's
3477 * nothing left to do.
3478 */
3479 if (level == cur->bc_nlevels - 1) {
3480 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3481 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3482 cur->bc_private.b.whichfork);
3483
3484 error = xfs_btree_kill_iroot(cur);
3485 if (error)
3486 goto error0;
3487
3488 error = xfs_btree_dec_cursor(cur, level, stat);
3489 if (error)
3490 goto error0;
3491 *stat = 1;
3492 return 0;
3493 }
3494
3495 /*
3496 * If this is the root level, and there's only one entry left,
3497 * and it's NOT the leaf level, then we can get rid of this
3498 * level.
3499 */
3500 if (numrecs == 1 && level > 0) {
3501 union xfs_btree_ptr *pp;
3502 /*
3503 * pp is still set to the first pointer in the block.
3504 * Make it the new root of the btree.
3505 */
3506 pp = xfs_btree_ptr_addr(cur, 1, block);
3507 error = xfs_btree_kill_root(cur, bp, level, pp);
3508 if (error)
3509 goto error0;
3510 } else if (level > 0) {
3511 error = xfs_btree_dec_cursor(cur, level, stat);
3512 if (error)
3513 goto error0;
3514 }
3515 *stat = 1;
3516 return 0;
3517 }
3518
3519 /*
3520 * If we deleted the leftmost entry in the block, update the
3521 * key values above us in the tree.
3522 */
3523 if (ptr == 1) {
3524 error = xfs_btree_updkey(cur, keyp, level + 1);
3525 if (error)
3526 goto error0;
3527 }
3528
3529 /*
3530 * If the number of records remaining in the block is at least
3531 * the minimum, we're done.
3532 */
3533 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3534 error = xfs_btree_dec_cursor(cur, level, stat);
3535 if (error)
3536 goto error0;
3537 return 0;
3538 }
3539
3540 /*
3541 * Otherwise, we have to move some records around to keep the
3542 * tree balanced. Look at the left and right sibling blocks to
3543 * see if we can re-balance by moving only one record.
3544 */
3545 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3546 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3547
3548 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3549 /*
3550 * One child of root, need to get a chance to copy its contents
3551 * into the root and delete it. Can't go up to next level,
3552 * there's nothing to delete there.
3553 */
3554 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3555 xfs_btree_ptr_is_null(cur, &lptr) &&
3556 level == cur->bc_nlevels - 2) {
3557 error = xfs_btree_kill_iroot(cur);
3558 if (!error)
3559 error = xfs_btree_dec_cursor(cur, level, stat);
3560 if (error)
3561 goto error0;
3562 return 0;
3563 }
3564 }
3565
3566 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3567 !xfs_btree_ptr_is_null(cur, &lptr));
3568
3569 /*
3570 * Duplicate the cursor so our btree manipulations here won't
3571 * disrupt the next level up.
3572 */
3573 error = xfs_btree_dup_cursor(cur, &tcur);
3574 if (error)
3575 goto error0;
3576
3577 /*
3578 * If there's a right sibling, see if it's ok to shift an entry
3579 * out of it.
3580 */
3581 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3582 /*
3583 * Move the temp cursor to the last entry in the next block.
3584 * Actually any entry but the first would suffice.
3585 */
3586 i = xfs_btree_lastrec(tcur, level);
3587 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3588
3589 error = xfs_btree_increment(tcur, level, &i);
3590 if (error)
3591 goto error0;
3592 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3593
3594 i = xfs_btree_lastrec(tcur, level);
3595 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3596
3597 /* Grab a pointer to the block. */
3598 right = xfs_btree_get_block(tcur, level, &rbp);
3599#ifdef DEBUG
3600 error = xfs_btree_check_block(tcur, right, level, rbp);
3601 if (error)
3602 goto error0;
3603#endif
3604 /* Grab the current block number, for future use. */
3605 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3606
3607 /*
3608 * If right block is full enough so that removing one entry
3609 * won't make it too empty, and left-shifting an entry out
3610 * of right to us works, we're done.
3611 */
3612 if (xfs_btree_get_numrecs(right) - 1 >=
3613 cur->bc_ops->get_minrecs(tcur, level)) {
3614 error = xfs_btree_lshift(tcur, level, &i);
3615 if (error)
3616 goto error0;
3617 if (i) {
3618 ASSERT(xfs_btree_get_numrecs(block) >=
3619 cur->bc_ops->get_minrecs(tcur, level));
3620
3621 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3622 tcur = NULL;
3623
3624 error = xfs_btree_dec_cursor(cur, level, stat);
3625 if (error)
3626 goto error0;
3627 return 0;
3628 }
3629 }
3630
3631 /*
3632 * Otherwise, grab the number of records in right for
3633 * future reference, and fix up the temp cursor to point
3634 * to our block again (last record).
3635 */
3636 rrecs = xfs_btree_get_numrecs(right);
3637 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3638 i = xfs_btree_firstrec(tcur, level);
3639 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3640
3641 error = xfs_btree_decrement(tcur, level, &i);
3642 if (error)
3643 goto error0;
3644 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3645 }
3646 }
3647
3648 /*
3649 * If there's a left sibling, see if it's ok to shift an entry
3650 * out of it.
3651 */
3652 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3653 /*
3654 * Move the temp cursor to the first entry in the
3655 * previous block.
3656 */
3657 i = xfs_btree_firstrec(tcur, level);
3658 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3659
3660 error = xfs_btree_decrement(tcur, level, &i);
3661 if (error)
3662 goto error0;
3663 i = xfs_btree_firstrec(tcur, level);
3664 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3665
3666 /* Grab a pointer to the block. */
3667 left = xfs_btree_get_block(tcur, level, &lbp);
3668#ifdef DEBUG
3669 error = xfs_btree_check_block(cur, left, level, lbp);
3670 if (error)
3671 goto error0;
3672#endif
3673 /* Grab the current block number, for future use. */
3674 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3675
3676 /*
3677 * If left block is full enough so that removing one entry
3678 * won't make it too empty, and right-shifting an entry out
3679 * of left to us works, we're done.
3680 */
3681 if (xfs_btree_get_numrecs(left) - 1 >=
3682 cur->bc_ops->get_minrecs(tcur, level)) {
3683 error = xfs_btree_rshift(tcur, level, &i);
3684 if (error)
3685 goto error0;
3686 if (i) {
3687 ASSERT(xfs_btree_get_numrecs(block) >=
3688 cur->bc_ops->get_minrecs(tcur, level));
3689 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3690 tcur = NULL;
3691 if (level == 0)
3692 cur->bc_ptrs[0]++;
3693 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3694 *stat = 1;
3695 return 0;
3696 }
3697 }
3698
3699 /*
3700 * Otherwise, grab the number of records in right for
3701 * future reference.
3702 */
3703 lrecs = xfs_btree_get_numrecs(left);
3704 }
3705
3706 /* Delete the temp cursor, we're done with it. */
3707 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3708 tcur = NULL;
3709
3710 /* If here, we need to do a join to keep the tree balanced. */
3711 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3712
3713 if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3714 lrecs + xfs_btree_get_numrecs(block) <=
3715 cur->bc_ops->get_maxrecs(cur, level)) {
3716 /*
3717 * Set "right" to be the starting block,
3718 * "left" to be the left neighbor.
3719 */
3720 rptr = cptr;
3721 right = block;
3722 rbp = bp;
3723 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3724 if (error)
3725 goto error0;
3726
3727 /*
3728 * If that won't work, see if we can join with the right neighbor block.
3729 */
3730 } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3731 rrecs + xfs_btree_get_numrecs(block) <=
3732 cur->bc_ops->get_maxrecs(cur, level)) {
3733 /*
3734 * Set "left" to be the starting block,
3735 * "right" to be the right neighbor.
3736 */
3737 lptr = cptr;
3738 left = block;
3739 lbp = bp;
3740 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3741 if (error)
3742 goto error0;
3743
3744 /*
3745 * Otherwise, we can't fix the imbalance.
3746 * Just return. This is probably a logic error, but it's not fatal.
3747 */
3748 } else {
3749 error = xfs_btree_dec_cursor(cur, level, stat);
3750 if (error)
3751 goto error0;
3752 return 0;
3753 }
3754
3755 rrecs = xfs_btree_get_numrecs(right);
3756 lrecs = xfs_btree_get_numrecs(left);
3757
3758 /*
3759 * We're now going to join "left" and "right" by moving all the stuff
3760 * in "right" to "left" and deleting "right".
3761 */
3762 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3763 if (level > 0) {
3764 /* It's a non-leaf. Move keys and pointers. */
3765 union xfs_btree_key *lkp; /* left btree key */
3766 union xfs_btree_ptr *lpp; /* left address pointer */
3767 union xfs_btree_key *rkp; /* right btree key */
3768 union xfs_btree_ptr *rpp; /* right address pointer */
3769
3770 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3771 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3772 rkp = xfs_btree_key_addr(cur, 1, right);
3773 rpp = xfs_btree_ptr_addr(cur, 1, right);
3774#ifdef DEBUG
3775 for (i = 1; i < rrecs; i++) {
3776 error = xfs_btree_check_ptr(cur, rpp, i, level);
3777 if (error)
3778 goto error0;
3779 }
3780#endif
3781 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3782 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3783
3784 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3785 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3786 } else {
3787 /* It's a leaf. Move records. */
3788 union xfs_btree_rec *lrp; /* left record pointer */
3789 union xfs_btree_rec *rrp; /* right record pointer */
3790
3791 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3792 rrp = xfs_btree_rec_addr(cur, 1, right);
3793
3794 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3795 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3796 }
3797
3798 XFS_BTREE_STATS_INC(cur, join);
3799
3800 /*
3801 * Fix up the number of records and right block pointer in the
3802 * surviving block, and log it.
3803 */
3804 xfs_btree_set_numrecs(left, lrecs + rrecs);
3805 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3806 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3807 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3808
3809 /* If there is a right sibling, point it to the remaining block. */
3810 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3811 if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3812 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
3813 if (error)
3814 goto error0;
3815 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3816 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3817 }
3818
3819 /* Free the deleted block. */
3820 error = cur->bc_ops->free_block(cur, rbp);
3821 if (error)
3822 goto error0;
3823 XFS_BTREE_STATS_INC(cur, free);
3824
3825 /*
3826 * If we joined with the left neighbor, set the buffer in the
3827 * cursor to the left block, and fix up the index.
3828 */
3829 if (bp != lbp) {
3830 cur->bc_bufs[level] = lbp;
3831 cur->bc_ptrs[level] += lrecs;
3832 cur->bc_ra[level] = 0;
3833 }
3834 /*
3835 * If we joined with the right neighbor and there's a level above
3836 * us, increment the cursor at that level.
3837 */
3838 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3839 (level + 1 < cur->bc_nlevels)) {
3840 error = xfs_btree_increment(cur, level + 1, &i);
3841 if (error)
3842 goto error0;
3843 }
3844
3845 /*
3846 * Readjust the ptr at this level if it's not a leaf, since it's
3847 * still pointing at the deletion point, which makes the cursor
3848 * inconsistent. If this makes the ptr 0, the caller fixes it up.
3849 * We can't use decrement because it would change the next level up.
3850 */
3851 if (level > 0)
3852 cur->bc_ptrs[level]--;
3853
3854 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3855 /* Return value means the next level up has something to do. */
3856 *stat = 2;
3857 return 0;
3858
3859error0:
3860 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3861 if (tcur)
3862 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3863 return error;
3864}
3865
3866/*
3867 * Delete the record pointed to by cur.
3868 * The cursor refers to the place where the record was (could be inserted)
3869 * when the operation returns.
3870 */
3871int /* error */
3872xfs_btree_delete(
3873 struct xfs_btree_cur *cur,
3874 int *stat) /* success/failure */
3875{
3876 int error; /* error return value */
3877 int level;
3878 int i;
3879
3880 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3881
3882 /*
3883 * Go up the tree, starting at leaf level.
3884 *
3885 * If 2 is returned then a join was done; go to the next level.
3886 * Otherwise we are done.
3887 */
3888 for (level = 0, i = 2; i == 2; level++) {
3889 error = xfs_btree_delrec(cur, level, &i);
3890 if (error)
3891 goto error0;
3892 }
3893
3894 if (i == 0) {
3895 for (level = 1; level < cur->bc_nlevels; level++) {
3896 if (cur->bc_ptrs[level] == 0) {
3897 error = xfs_btree_decrement(cur, level, &i);
3898 if (error)
3899 goto error0;
3900 break;
3901 }
3902 }
3903 }
3904
3905 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3906 *stat = i;
3907 return 0;
3908error0:
3909 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3910 return error;
3911}
3912
3913/*
3914 * Get the data from the pointed-to record.
3915 */
3916int /* error */
3917xfs_btree_get_rec(
3918 struct xfs_btree_cur *cur, /* btree cursor */
3919 union xfs_btree_rec **recp, /* output: btree record */
3920 int *stat) /* output: success/failure */
3921{
3922 struct xfs_btree_block *block; /* btree block */
3923 struct xfs_buf *bp; /* buffer pointer */
3924 int ptr; /* record number */
3925#ifdef DEBUG
3926 int error; /* error return value */
3927#endif
3928
3929 ptr = cur->bc_ptrs[0];
3930 block = xfs_btree_get_block(cur, 0, &bp);
3931
3932#ifdef DEBUG
3933 error = xfs_btree_check_block(cur, block, 0, bp);
3934 if (error)
3935 return error;
3936#endif
3937
3938 /*
3939 * Off the right end or left end, return failure.
3940 */
3941 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3942 *stat = 0;
3943 return 0;
3944 }
3945
3946 /*
3947 * Point to the record and extract its data.
3948 */
3949 *recp = xfs_btree_rec_addr(cur, ptr, block);
3950 *stat = 1;
3951 return 0;
3952}
3953
3954/*
3955 * Change the owner of a btree.
3956 *
3957 * The mechanism we use here is ordered buffer logging. Because we don't know
3958 * how many buffers were are going to need to modify, we don't really want to
3959 * have to make transaction reservations for the worst case of every buffer in a
3960 * full size btree as that may be more space that we can fit in the log....
3961 *
3962 * We do the btree walk in the most optimal manner possible - we have sibling
3963 * pointers so we can just walk all the blocks on each level from left to right
3964 * in a single pass, and then move to the next level and do the same. We can
3965 * also do readahead on the sibling pointers to get IO moving more quickly,
3966 * though for slow disks this is unlikely to make much difference to performance
3967 * as the amount of CPU work we have to do before moving to the next block is
3968 * relatively small.
3969 *
3970 * For each btree block that we load, modify the owner appropriately, set the
3971 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3972 * we mark the region we change dirty so that if the buffer is relogged in
3973 * a subsequent transaction the changes we make here as an ordered buffer are
3974 * correctly relogged in that transaction. If we are in recovery context, then
3975 * just queue the modified buffer as delayed write buffer so the transaction
3976 * recovery completion writes the changes to disk.
3977 */
3978static int
3979xfs_btree_block_change_owner(
3980 struct xfs_btree_cur *cur,
3981 int level,
3982 __uint64_t new_owner,
3983 struct list_head *buffer_list)
3984{
3985 struct xfs_btree_block *block;
3986 struct xfs_buf *bp;
3987 union xfs_btree_ptr rptr;
3988
3989 /* do right sibling readahead */
3990 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3991
3992 /* modify the owner */
3993 block = xfs_btree_get_block(cur, level, &bp);
3994 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3995 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3996 else
3997 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3998
3999 /*
4000 * If the block is a root block hosted in an inode, we might not have a
4001 * buffer pointer here and we shouldn't attempt to log the change as the
4002 * information is already held in the inode and discarded when the root
4003 * block is formatted into the on-disk inode fork. We still change it,
4004 * though, so everything is consistent in memory.
4005 */
4006 if (bp) {
4007 if (cur->bc_tp) {
4008 xfs_trans_ordered_buf(cur->bc_tp, bp);
4009 xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4010 } else {
4011 xfs_buf_delwri_queue(bp, buffer_list);
4012 }
4013 } else {
4014 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4015 ASSERT(level == cur->bc_nlevels - 1);
4016 }
4017
4018 /* now read rh sibling block for next iteration */
4019 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4020 if (xfs_btree_ptr_is_null(cur, &rptr))
4021 return -ENOENT;
4022
4023 return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4024}
4025
4026int
4027xfs_btree_change_owner(
4028 struct xfs_btree_cur *cur,
4029 __uint64_t new_owner,
4030 struct list_head *buffer_list)
4031{
4032 union xfs_btree_ptr lptr;
4033 int level;
4034 struct xfs_btree_block *block = NULL;
4035 int error = 0;
4036
4037 cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4038
4039 /* for each level */
4040 for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4041 /* grab the left hand block */
4042 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4043 if (error)
4044 return error;
4045
4046 /* readahead the left most block for the next level down */
4047 if (level > 0) {
4048 union xfs_btree_ptr *ptr;
4049
4050 ptr = xfs_btree_ptr_addr(cur, 1, block);
4051 xfs_btree_readahead_ptr(cur, ptr, 1);
4052
4053 /* save for the next iteration of the loop */
4054 lptr = *ptr;
4055 }
4056
4057 /* for each buffer in the level */
4058 do {
4059 error = xfs_btree_block_change_owner(cur, level,
4060 new_owner,
4061 buffer_list);
4062 } while (!error);
4063
4064 if (error != -ENOENT)
4065 return error;
4066 }
4067
4068 return 0;
4069}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
new file mode 100644
index 000000000000..a04b69422f67
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -0,0 +1,468 @@
1/*
2 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BTREE_H__
19#define __XFS_BTREE_H__
20
21struct xfs_buf;
22struct xfs_bmap_free;
23struct xfs_inode;
24struct xfs_mount;
25struct xfs_trans;
26
27extern kmem_zone_t *xfs_btree_cur_zone;
28
29/*
30 * Generic key, ptr and record wrapper structures.
31 *
32 * These are disk format structures, and are converted where necessary
33 * by the btree specific code that needs to interpret them.
34 */
35union xfs_btree_ptr {
36 __be32 s; /* short form ptr (32 bit, big endian) */
37 __be64 l; /* long form ptr (64 bit, big endian) */
38};
39
40union xfs_btree_key {
41 xfs_bmbt_key_t bmbt;
42 xfs_bmdr_key_t bmbr; /* bmbt root block */
43 xfs_alloc_key_t alloc;
44 xfs_inobt_key_t inobt;
45};
46
47union xfs_btree_rec {
48 xfs_bmbt_rec_t bmbt;
49 xfs_bmdr_rec_t bmbr; /* bmbt root block */
50 xfs_alloc_rec_t alloc;
51 xfs_inobt_rec_t inobt;
52};
53
54/*
55 * This nonsense is to make -wlint happy.
56 */
57#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
58#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
59#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
60
61#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
62#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
63#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
64#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
65#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
66
67/*
68 * Bit flags naming individual btree block header fields, for logging record fields.
69 */
70#define XFS_BB_MAGIC (1 << 0)
71#define XFS_BB_LEVEL (1 << 1)
72#define XFS_BB_NUMRECS (1 << 2)
73#define XFS_BB_LEFTSIB (1 << 3)
74#define XFS_BB_RIGHTSIB (1 << 4)
75#define XFS_BB_BLKNO (1 << 5)
76#define XFS_BB_LSN (1 << 6)
77#define XFS_BB_UUID (1 << 7)
78#define XFS_BB_OWNER (1 << 8)
79#define XFS_BB_NUM_BITS 5 /* bits 0-4: MAGIC through RIGHTSIB */
80#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
81#define XFS_BB_NUM_BITS_CRC 9 /* bits 0-8: MAGIC through OWNER */
82#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
83
/*
 * Generic stats interface.
 *
 * XFS_BTREE_STATS_INC(cur, stat) and XFS_BTREE_STATS_ADD(cur, stat, val)
 * update the counter named xs_<type>_2_<stat>, where <type> is picked from
 * the cursor's bc_btnum (abtb, abtc, bmbt, ibt or fibt).
 *
 * @cur and @val are parenthesised in the expansion so that callers may pass
 * any expression (e.g. "*curp"), not only a plain identifier.
 */
#define __XFS_BTREE_STATS_INC(type, stat) \
	XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
#define XFS_BTREE_STATS_INC(cur, stat)	\
do {	\
	switch ((cur)->bc_btnum) {	\
	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;	\
	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;	\
	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;	\
	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;	\
	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break;	\
	case XFS_BTNUM_MAX: ASSERT(0); /* only here to keep gcc quiet */ ; break;	\
	}	\
} while (0)

#define __XFS_BTREE_STATS_ADD(type, stat, val) \
	XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, (val))
#define XFS_BTREE_STATS_ADD(cur, stat, val)	\
do {	\
	switch ((cur)->bc_btnum) {	\
	case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break;	\
	case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break;	\
	case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break;	\
	case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break;	\
	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break;	\
	case XFS_BTNUM_MAX: ASSERT(0); /* only here to keep gcc quiet */ ; break;	\
	}	\
} while (0)
114
115#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
116
117struct xfs_btree_ops {
118 /* size of the key and record structures */
119 size_t key_len;
120 size_t rec_len;
121
122 /* cursor operations */
123 struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
124 void (*update_cursor)(struct xfs_btree_cur *src,
125 struct xfs_btree_cur *dst);
126
127 /* update btree root pointer */
128 void (*set_root)(struct xfs_btree_cur *cur,
129 union xfs_btree_ptr *nptr, int level_change);
130
131 /* block allocation / freeing */
132 int (*alloc_block)(struct xfs_btree_cur *cur,
133 union xfs_btree_ptr *start_bno,
134 union xfs_btree_ptr *new_bno,
135 int *stat);
136 int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
137
138 /* update last record information; see the LASTREC_* reasons */
139 void (*update_lastrec)(struct xfs_btree_cur *cur,
140 struct xfs_btree_block *block,
141 union xfs_btree_rec *rec,
142 int ptr, int reason);
143
144 /* records in block/level */
145 int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
146 int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
147
148 /* records on disk; this matters for the root-in-inode case */
149 int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
150
151 /* init values of btree structures */
152 void (*init_key_from_rec)(union xfs_btree_key *key,
153 union xfs_btree_rec *rec);
154 void (*init_rec_from_key)(union xfs_btree_key *key,
155 union xfs_btree_rec *rec);
156 void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
157 union xfs_btree_rec *rec);
158 void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
159 union xfs_btree_ptr *ptr);
160
161 /* difference between key value and cursor value */
162 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
163 union xfs_btree_key *key);
164
165 const struct xfs_buf_ops *buf_ops; /* buffer ops for this btree's blocks */
166
167#if defined(DEBUG) || defined(XFS_WARN) /* ordering checks exist only in these builds */
168 /* check that k1 is lower than k2 */
169 int (*keys_inorder)(struct xfs_btree_cur *cur,
170 union xfs_btree_key *k1,
171 union xfs_btree_key *k2);
172
173 /* check that r1 is lower than r2 */
174 int (*recs_inorder)(struct xfs_btree_cur *cur,
175 union xfs_btree_rec *r1,
176 union xfs_btree_rec *r2);
177#endif
178};
179
180/*
181 * Reasons for the update_lastrec method to be called.
182 */
183#define LASTREC_UPDATE 0
184#define LASTREC_INSREC 1
185#define LASTREC_DELREC 2
186
187
188/*
189 * Btree cursor structure.
190 * This collects all information needed by the btree code in one place.
191 */
192typedef struct xfs_btree_cur
193{
194 struct xfs_trans *bc_tp; /* transaction we're in, if any */
195 struct xfs_mount *bc_mp; /* file system mount struct */
196 const struct xfs_btree_ops *bc_ops; /* per-btree-type operations */
197 uint bc_flags; /* btree features - XFS_BTREE_* flags below */
198 union {
199 xfs_alloc_rec_incore_t a;
200 xfs_bmbt_irec_t b;
201 xfs_inobt_rec_incore_t i;
202 } bc_rec; /* current insert/search record value */
203 struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
204 int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # per level */
205 __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits per level */
206#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
207#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
208 __uint8_t bc_nlevels; /* number of levels in the tree */
209 __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
210 xfs_btnum_t bc_btnum; /* identifies which btree type */
211 union {
212 struct { /* needed for BNO, CNT, INO */
213 struct xfs_buf *agbp; /* agf/agi buffer pointer */
214 xfs_agnumber_t agno; /* ag number */
215 } a;
216 struct { /* needed for BMAP */
217 struct xfs_inode *ip; /* pointer to our inode */
218 struct xfs_bmap_free *flist; /* list to free after */
219 xfs_fsblock_t firstblock; /* 1st blk allocated */
220 int allocated; /* count of alloced */
221 short forksize; /* fork's inode space */
222 char whichfork; /* data or attr fork */
223 char flags; /* flags: XFS_BTCUR_BPRV_* */
224#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
225 } b;
226 } bc_private; /* per-btree type data */
227} xfs_btree_cur_t;
228
229/* cursor flags */
230#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
231#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
232#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
233#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
234
235
236#define XFS_BTREE_NOERROR 0
237#define XFS_BTREE_ERROR 1
238
239/*
240 * Convert from buffer to btree block header.
241 */
242#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
243
244
245/*
246 * Check that block header is ok.
247 */
248int
249xfs_btree_check_block(
250 struct xfs_btree_cur *cur, /* btree cursor */
251 struct xfs_btree_block *block, /* generic btree block pointer */
252 int level, /* level of the btree block */
253 struct xfs_buf *bp); /* buffer containing block, if any */
254
255/*
256 * Check that (long) pointer is ok.
257 */
258int /* error (0 or EFSCORRUPTED) */
259xfs_btree_check_lptr(
260 struct xfs_btree_cur *cur, /* btree cursor */
261 xfs_dfsbno_t ptr, /* btree block disk address */
262 int level); /* btree block level */
263
264/*
265 * Delete the btree cursor.
266 */
267void
268xfs_btree_del_cursor(
269 xfs_btree_cur_t *cur, /* btree cursor */
270 int error); /* del because of error */
271
272/*
273 * Duplicate the btree cursor.
274 * Allocate a new one, copy the record, re-get the buffers.
275 */
276int /* error */
277xfs_btree_dup_cursor(
278 xfs_btree_cur_t *cur, /* input cursor */
279 xfs_btree_cur_t **ncur);/* output cursor */
280
281/*
282 * Get a buffer for the block, return it with no data read.
283 * Long-form addressing.
284 */
285struct xfs_buf * /* buffer for fsbno */
286xfs_btree_get_bufl(
287 struct xfs_mount *mp, /* file system mount point */
288 struct xfs_trans *tp, /* transaction pointer */
289 xfs_fsblock_t fsbno, /* file system block number */
290 uint lock); /* lock flags for get_buf */
291
292/*
293 * Get a buffer for the block, return it with no data read.
294 * Short-form addressing.
295 */
296struct xfs_buf * /* buffer for agno/agbno */
297xfs_btree_get_bufs(
298 struct xfs_mount *mp, /* file system mount point */
299 struct xfs_trans *tp, /* transaction pointer */
300 xfs_agnumber_t agno, /* allocation group number */
301 xfs_agblock_t agbno, /* allocation group block number */
302 uint lock); /* lock flags for get_buf */
303
304/*
305 * Check for the cursor referring to the last block at the given level.
306 */
307int /* 1=is last block, 0=not last block */
308xfs_btree_islastblock(
309 xfs_btree_cur_t *cur, /* btree cursor */
310 int level); /* level to check */
311
312/*
313 * Compute first and last byte offsets for the fields given.
314 * Interprets the offsets table, which contains struct field offsets.
315 */
316void
317xfs_btree_offsets(
318 __int64_t fields, /* bitmask of fields */
319 const short *offsets,/* table of field offsets */
320 int nbits, /* number of bits to inspect */
321 int *first, /* output: first byte offset */
322 int *last); /* output: last byte offset */
323
324/*
325 * Get a buffer for the block, return it read in.
326 * Long-form addressing.
327 */
328int /* error */
329xfs_btree_read_bufl(
330 struct xfs_mount *mp, /* file system mount point */
331 struct xfs_trans *tp, /* transaction pointer */
332 xfs_fsblock_t fsbno, /* file system block number */
333 uint lock, /* lock flags for read_buf */
334 struct xfs_buf **bpp, /* buffer for fsbno */
335 int refval, /* ref count value for buffer */
336 const struct xfs_buf_ops *ops);
337
338/*
339 * Read-ahead the block, don't wait for it, don't return a buffer.
340 * Long-form addressing. Nothing is returned, so no error is reported to the caller.
341 */
342void /* no return value */
343xfs_btree_reada_bufl(
344 struct xfs_mount *mp, /* file system mount point */
345 xfs_fsblock_t fsbno, /* file system block number */
346 xfs_extlen_t count, /* count of filesystem blocks */
347 const struct xfs_buf_ops *ops);
348
349/*
350 * Read-ahead the block, don't wait for it, don't return a buffer.
351 * Short-form addressing. Nothing is returned, so no error is reported to the caller.
352 */
353void /* no return value */
354xfs_btree_reada_bufs(
355 struct xfs_mount *mp, /* file system mount point */
356 xfs_agnumber_t agno, /* allocation group number */
357 xfs_agblock_t agbno, /* allocation group block number */
358 xfs_extlen_t count, /* count of filesystem blocks */
359 const struct xfs_buf_ops *ops);
360
361/*
362 * Initialise a new btree block header
363 */
364void
365xfs_btree_init_block(
366 struct xfs_mount *mp,
367 struct xfs_buf *bp,
368 __u32 magic,
369 __u16 level,
370 __u16 numrecs,
371 __u64 owner,
372 unsigned int flags);
373
374void
375xfs_btree_init_block_int(
376 struct xfs_mount *mp,
377 struct xfs_btree_block *buf,
378 xfs_daddr_t blkno,
379 __u32 magic,
380 __u16 level,
381 __u16 numrecs,
382 __u64 owner,
383 unsigned int flags);
384
385/*
386 * Common btree core entry points.
387 */
388int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
389int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
390int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
391int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
392int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
393int xfs_btree_insert(struct xfs_btree_cur *, int *);
394int xfs_btree_delete(struct xfs_btree_cur *, int *);
395int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
396int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
397 struct list_head *buffer_list);
398
399/*
400 * btree block CRC helpers
401 */
402void xfs_btree_lblock_calc_crc(struct xfs_buf *);
403bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
404void xfs_btree_sblock_calc_crc(struct xfs_buf *);
405bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
406
407/*
408 * Internal btree helpers also used by xfs_bmap.c.
409 */
410void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
411void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
412
413/*
414 * Helpers.
415 */
416static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
417{
418 return be16_to_cpu(block->bb_numrecs);
419}
420
421static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
422 __uint16_t numrecs)
423{
424 block->bb_numrecs = cpu_to_be16(numrecs);
425}
426
427static inline int xfs_btree_get_level(struct xfs_btree_block *block)
428{
429 return be16_to_cpu(block->bb_level);
430}
431
432
433/*
434 * Min and max functions for extlen, agblock, fileoff, and filblks types.
435 */
436#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b))
437#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b))
438#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b))
439#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b))
440#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b))
441#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b))
442#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
443#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
444
/*
 * True when @fsb decodes to an AG number below sb_agcount and an AG block
 * number below sb_agblocks for mount @mp.  @mp is parenthesised where it is
 * dereferenced so that any pointer expression (e.g. "&mount") may be passed.
 */
#define XFS_FSB_SANITY_CHECK(mp,fsb)	\
	(XFS_FSB_TO_AGNO(mp, fsb) < (mp)->m_sb.sb_agcount && \
	 XFS_FSB_TO_AGBNO(mp, fsb) < (mp)->m_sb.sb_agblocks)
448
449/*
450 * Trace hooks. Currently not implemented as they need to be ported
451 * over to the generic tracing functionality, which is some effort.
452 *
453 * i,j = integer (32 bit)
454 * b = btree block buffer (xfs_buf_t)
455 * p = btree ptr
456 * r = btree record
457 * k = btree key
458 */
459#define XFS_BTREE_TRACE_ARGBI(c, b, i)
460#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
461#define XFS_BTREE_TRACE_ARGI(c, i)
462#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
463#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
464#define XFS_BTREE_TRACE_ARGIK(c, i, k)
465#define XFS_BTREE_TRACE_ARGR(c, r)
466#define XFS_BTREE_TRACE_CURSOR(c, t)
467
468#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -0,0 +1,63 @@
1#ifndef _XFS_CKSUM_H
2#define _XFS_CKSUM_H 1
3
4#define XFS_CRC_SEED (~(__uint32_t)0)
5
6/*
7 * Calculate the intermediate checksum for a buffer that has the CRC field
8 * inside it. The offset of the 32bit crc fields is passed as the
9 * cksum_offset parameter.
10 */
11static inline __uint32_t
12xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
13{
14 __uint32_t zero = 0;
15 __uint32_t crc;
16
17 /* Calculate CRC up to the checksum. */
18 crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
19
20 /* Skip checksum field */
21 crc = crc32c(crc, &zero, sizeof(__u32));
22
23 /* Calculate the rest of the CRC. */
24 return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
25 length - (cksum_offset + sizeof(__be32)));
26}
27
28/*
29 * Convert the intermediate checksum to the final ondisk format.
30 *
31 * The CRC32c calculation uses LE format even on BE machines, but returns the
32 * result in host endian format. Hence we need to byte swap it back to LE format
33 * so that it is consistent on disk.
34 */
35static inline __le32
36xfs_end_cksum(__uint32_t crc)
37{
38 return ~cpu_to_le32(crc);
39}
40
41/*
42 * Helper to generate the checksum for a buffer.
43 */
44static inline void
45xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
46{
47 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
48
49 *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
50}
51
52/*
53 * Helper to verify the checksum for a buffer.
54 */
55static inline int
56xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
57{
58 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
59
60 return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
61}
62
63#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
new file mode 100644
index 000000000000..8d809873525b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -0,0 +1,2665 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_shared.h"
22#include "xfs_format.h"
23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h"
25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_format.h"
30#include "xfs_da_btree.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h"
33#include "xfs_inode.h"
34#include "xfs_trans.h"
35#include "xfs_inode_item.h"
36#include "xfs_alloc.h"
37#include "xfs_bmap.h"
38#include "xfs_attr.h"
39#include "xfs_attr_leaf.h"
40#include "xfs_error.h"
41#include "xfs_trace.h"
42#include "xfs_cksum.h"
43#include "xfs_buf_item.h"
44
45/*
46 * xfs_da_btree.c
47 *
48 * Routines to implement directories as Btrees of hashed names.
49 */
50
51/*========================================================================
52 * Function prototypes for the kernel.
53 *========================================================================*/
54
55/*
56 * Routines used for growing the Btree.
57 */
58STATIC int xfs_da3_root_split(xfs_da_state_t *state,
59 xfs_da_state_blk_t *existing_root,
60 xfs_da_state_blk_t *new_child);
61STATIC int xfs_da3_node_split(xfs_da_state_t *state,
62 xfs_da_state_blk_t *existing_blk,
63 xfs_da_state_blk_t *split_blk,
64 xfs_da_state_blk_t *blk_to_add,
65 int treelevel,
66 int *result);
67STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
68 xfs_da_state_blk_t *node_blk_1,
69 xfs_da_state_blk_t *node_blk_2);
70STATIC void xfs_da3_node_add(xfs_da_state_t *state,
71 xfs_da_state_blk_t *old_node_blk,
72 xfs_da_state_blk_t *new_node_blk);
73
74/*
75 * Routines used for shrinking the Btree.
76 */
77STATIC int xfs_da3_root_join(xfs_da_state_t *state,
78 xfs_da_state_blk_t *root_blk);
79STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
80STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
81 xfs_da_state_blk_t *drop_blk);
82STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
83 xfs_da_state_blk_t *src_node_blk,
84 xfs_da_state_blk_t *dst_node_blk);
85
86/*
87 * Utility routines.
88 */
89STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
90 xfs_da_state_blk_t *drop_blk,
91 xfs_da_state_blk_t *save_blk);
92
93
94kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
95
96/*
97 * Allocate a dir-state structure.
98 * We don't put them on the stack since they're large.
99 */
100xfs_da_state_t *
101xfs_da_state_alloc(void)
102{
103 return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
104}
105
106/*
107 * Kill the altpath contents of a da-state structure.
108 */
109STATIC void
110xfs_da_state_kill_altpath(xfs_da_state_t *state)
111{
112 int i;
113
114 for (i = 0; i < state->altpath.active; i++)
115 state->altpath.blk[i].bp = NULL;
116 state->altpath.active = 0;
117}
118
119/*
120 * Free a da-state structure.
121 */
122void
123xfs_da_state_free(xfs_da_state_t *state)
124{
125 xfs_da_state_kill_altpath(state);
126#ifdef DEBUG
127 memset((char *)state, 0, sizeof(*state));
128#endif /* DEBUG */
129 kmem_zone_free(xfs_da_state_zone, state);
130}
131
132static bool
133xfs_da3_node_verify(
134 struct xfs_buf *bp)
135{
136 struct xfs_mount *mp = bp->b_target->bt_mount;
137 struct xfs_da_intnode *hdr = bp->b_addr;
138 struct xfs_da3_icnode_hdr ichdr;
139 const struct xfs_dir_ops *ops;
140
141 ops = xfs_dir_get_ops(mp, NULL);
142
143 ops->node_hdr_from_disk(&ichdr, hdr);
144
145 if (xfs_sb_version_hascrc(&mp->m_sb)) {
146 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
147
148 if (ichdr.magic != XFS_DA3_NODE_MAGIC)
149 return false;
150
151 if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
152 return false;
153 if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
154 return false;
155 } else {
156 if (ichdr.magic != XFS_DA_NODE_MAGIC)
157 return false;
158 }
159 if (ichdr.level == 0)
160 return false;
161 if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
162 return false;
163 if (ichdr.count == 0)
164 return false;
165
166 /*
167 * we don't know if the node is for and attribute or directory tree,
168 * so only fail if the count is outside both bounds
169 */
170 if (ichdr.count > mp->m_dir_geo->node_ents &&
171 ichdr.count > mp->m_attr_geo->node_ents)
172 return false;
173
174 /* XXX: hash order check? */
175
176 return true;
177}
178
179static void
180xfs_da3_node_write_verify(
181 struct xfs_buf *bp)
182{
183 struct xfs_mount *mp = bp->b_target->bt_mount;
184 struct xfs_buf_log_item *bip = bp->b_fspriv;
185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
186
187 if (!xfs_da3_node_verify(bp)) {
188 xfs_buf_ioerror(bp, -EFSCORRUPTED);
189 xfs_verifier_error(bp);
190 return;
191 }
192
193 if (!xfs_sb_version_hascrc(&mp->m_sb))
194 return;
195
196 if (bip)
197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
198
199 xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
200}
201
202/*
203 * leaf/node format detection on trees is sketchy, so a node read can be done on
204 * leaf level blocks when detection identifies the tree as a node format tree
205 * incorrectly. In this case, we need to swap the verifier to match the correct
206 * format of the block being read.
207 */
208static void
209xfs_da3_node_read_verify(
210 struct xfs_buf *bp)
211{
212 struct xfs_da_blkinfo *info = bp->b_addr;
213
214 switch (be16_to_cpu(info->magic)) {
215 case XFS_DA3_NODE_MAGIC:
216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
217 xfs_buf_ioerror(bp, -EFSBADCRC);
218 break;
219 }
220 /* fall through */
221 case XFS_DA_NODE_MAGIC:
222 if (!xfs_da3_node_verify(bp)) {
223 xfs_buf_ioerror(bp, -EFSCORRUPTED);
224 break;
225 }
226 return;
227 case XFS_ATTR_LEAF_MAGIC:
228 case XFS_ATTR3_LEAF_MAGIC:
229 bp->b_ops = &xfs_attr3_leaf_buf_ops;
230 bp->b_ops->verify_read(bp);
231 return;
232 case XFS_DIR2_LEAFN_MAGIC:
233 case XFS_DIR3_LEAFN_MAGIC:
234 bp->b_ops = &xfs_dir3_leafn_buf_ops;
235 bp->b_ops->verify_read(bp);
236 return;
237 default:
238 break;
239 }
240
241 /* corrupt block */
242 xfs_verifier_error(bp);
243}
244
245const struct xfs_buf_ops xfs_da3_node_buf_ops = { /* read/write verifiers for da btree node buffers */
246 .verify_read = xfs_da3_node_read_verify,
247 .verify_write = xfs_da3_node_write_verify,
248};
249
250int
251xfs_da3_node_read(
252 struct xfs_trans *tp,
253 struct xfs_inode *dp,
254 xfs_dablk_t bno,
255 xfs_daddr_t mappedbno,
256 struct xfs_buf **bpp,
257 int which_fork)
258{
259 int err;
260
261 err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
262 which_fork, &xfs_da3_node_buf_ops);
263 if (!err && tp) {
264 struct xfs_da_blkinfo *info = (*bpp)->b_addr;
265 int type;
266
267 switch (be16_to_cpu(info->magic)) {
268 case XFS_DA_NODE_MAGIC:
269 case XFS_DA3_NODE_MAGIC:
270 type = XFS_BLFT_DA_NODE_BUF;
271 break;
272 case XFS_ATTR_LEAF_MAGIC:
273 case XFS_ATTR3_LEAF_MAGIC:
274 type = XFS_BLFT_ATTR_LEAF_BUF;
275 break;
276 case XFS_DIR2_LEAFN_MAGIC:
277 case XFS_DIR3_LEAFN_MAGIC:
278 type = XFS_BLFT_DIR_LEAFN_BUF;
279 break;
280 default:
281 type = 0;
282 ASSERT(0);
283 break;
284 }
285 xfs_trans_buf_set_type(tp, *bpp, type);
286 }
287 return err;
288}
289
290/*========================================================================
291 * Routines used for growing the Btree.
292 *========================================================================*/
293
294/*
295 * Create the initial contents of an intermediate node.
296 */
297int
298xfs_da3_node_create(
299 struct xfs_da_args *args,
300 xfs_dablk_t blkno,
301 int level,
302 struct xfs_buf **bpp,
303 int whichfork)
304{
305 struct xfs_da_intnode *node;
306 struct xfs_trans *tp = args->trans;
307 struct xfs_mount *mp = tp->t_mountp;
308 struct xfs_da3_icnode_hdr ichdr = {0};
309 struct xfs_buf *bp;
310 int error;
311 struct xfs_inode *dp = args->dp;
312
313 trace_xfs_da_node_create(args);
314 ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
315
316 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
317 if (error)
318 return error;
319 bp->b_ops = &xfs_da3_node_buf_ops;
320 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
321 node = bp->b_addr;
322
323 if (xfs_sb_version_hascrc(&mp->m_sb)) {
324 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
325
326 ichdr.magic = XFS_DA3_NODE_MAGIC;
327 hdr3->info.blkno = cpu_to_be64(bp->b_bn);
328 hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
329 uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
330 } else {
331 ichdr.magic = XFS_DA_NODE_MAGIC;
332 }
333 ichdr.level = level;
334
335 dp->d_ops->node_hdr_to_disk(node, &ichdr);
336 xfs_trans_log_buf(tp, bp,
337 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
338
339 *bpp = bp;
340 return 0;
341}
342
343/*
344 * Split the leaf block at the bottom of state->path, rebalance, then walk back
345 * up the path splitting intermediate nodes as needed; the root is split last if a block is still pending.
346 */
347int /* 0 on success, else the error from the failed split step */
348xfs_da3_split(
349 struct xfs_da_state *state)
350{
351 struct xfs_da_state_blk *oldblk;
352 struct xfs_da_state_blk *newblk;
353 struct xfs_da_state_blk *addblk;
354 struct xfs_da_intnode *node;
355 struct xfs_buf *bp;
356 int max;
357 int action = 0;
358 int error;
359 int i;
360
361 trace_xfs_da_split(state->args);
362
363 /*
364 * Walk back up the tree splitting/inserting/adjusting as necessary.
365 * If we need to insert and there isn't room, split the node, then
366 * decide which fragment to insert the new block from below into.
367 * Note that we may split the root this way, but we need more fixup.
368 */
369 max = state->path.active - 1;
370 ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
371 ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
372 state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
373
374 addblk = &state->path.blk[max]; /* initial dummy value */
375 for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
376 oldblk = &state->path.blk[i];
377 newblk = &state->altpath.blk[i];
378
379 /*
380 * If a leaf node then
381 * Allocate a new leaf node, then rebalance across them.
382 * else if an intermediate node then
383 * We split on the last layer, must we split the node?
384 */
385 switch (oldblk->magic) {
386 case XFS_ATTR_LEAF_MAGIC:
387 error = xfs_attr3_leaf_split(state, oldblk, newblk);
388 if ((error != 0) && (error != -ENOSPC)) {
389 return error; /* GROT: attr is inconsistent */
390 }
391 if (!error) {
392 addblk = newblk;
393 break;
394 }
395 /*
396 * -ENOSPC: entry wouldn't fit, split the leaf again into state->extrablk.
397 */
398 state->extravalid = 1;
399 if (state->inleaf) {
400 state->extraafter = 0; /* before newblk */
401 trace_xfs_attr_leaf_split_before(state->args);
402 error = xfs_attr3_leaf_split(state, oldblk,
403 &state->extrablk);
404 } else {
405 state->extraafter = 1; /* after newblk */
406 trace_xfs_attr_leaf_split_after(state->args);
407 error = xfs_attr3_leaf_split(state, newblk,
408 &state->extrablk);
409 }
410 if (error)
411 return error; /* GROT: attr inconsistent */
412 addblk = newblk;
413 break;
414 case XFS_DIR2_LEAFN_MAGIC:
415 error = xfs_dir2_leafn_split(state, oldblk, newblk);
416 if (error)
417 return error;
418 addblk = newblk;
419 break;
420 case XFS_DA_NODE_MAGIC:
421 error = xfs_da3_node_split(state, oldblk, newblk, addblk,
422 max - i, &action);
423 addblk->bp = NULL;
424 if (error)
425 return error; /* GROT: dir is inconsistent */
426 /*
427 * If the node split set action, carry newblk up to the next level; otherwise clear addblk, which ends the walk.
428 */
429 if (action)
430 addblk = newblk;
431 else
432 addblk = NULL;
433 break;
434 }
435
436 /*
437 * Update the btree to show the new hashval for this child.
438 */
439 xfs_da3_fixhashpath(state, &state->path);
440 }
441 if (!addblk)
442 return 0; /* nothing left to add, so no root split */
443
444 /*
445 * Split the root node.
446 */
447 ASSERT(state->path.active == 0);
448 oldblk = &state->path.blk[0];
449 error = xfs_da3_root_split(state, oldblk, addblk);
450 if (error) {
451 addblk->bp = NULL;
452 return error; /* GROT: dir is inconsistent */
453 }
454
455 /*
456 * Update pointers to the node which used to be block 0 and
457 * just got bumped because of the addition of a new root node.
458 * There might be three blocks involved if a double split occurred,
459 * and the original block 0 could be at any position in the list.
460 *
461 * Note: the magic numbers and sibling pointers are in the same
462 * physical place for both v2 and v3 headers (by design). Hence it
463 * doesn't matter which version of the xfs_da_intnode structure we use
464 * here as the result will be the same using either structure.
465 */
466 node = oldblk->bp->b_addr;
467 if (node->hdr.info.forw) {
468 if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
469 bp = addblk->bp;
470 } else {
471 ASSERT(state->extravalid);
472 bp = state->extrablk.bp;
473 }
474 node = bp->b_addr;
475 node->hdr.info.back = cpu_to_be32(oldblk->blkno);
476 xfs_trans_log_buf(state->args->trans, bp,
477 XFS_DA_LOGRANGE(node, &node->hdr.info,
478 sizeof(node->hdr.info)));
479 }
480 node = oldblk->bp->b_addr;
481 if (node->hdr.info.back) {
482 if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
483 bp = addblk->bp;
484 } else {
485 ASSERT(state->extravalid);
486 bp = state->extrablk.bp;
487 }
488 node = bp->b_addr;
489 node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
490 xfs_trans_log_buf(state->args->trans, bp,
491 XFS_DA_LOGRANGE(node, &node->hdr.info,
492 sizeof(node->hdr.info)));
493 }
494 addblk->bp = NULL;
495 return 0;
496}
497
498/*
499 * Split the root. We have to create a new root and point to the two
500 * parts (the split old root) that we just created. Copy block zero to
501 * the EOF, extending the inode in process.
502 */
503STATIC int /* error */
504xfs_da3_root_split(
505 struct xfs_da_state *state,
506 struct xfs_da_state_blk *blk1,
507 struct xfs_da_state_blk *blk2)
508{
509 struct xfs_da_intnode *node;
510 struct xfs_da_intnode *oldroot;
511 struct xfs_da_node_entry *btree;
512 struct xfs_da3_icnode_hdr nodehdr;
513 struct xfs_da_args *args;
514 struct xfs_buf *bp;
515 struct xfs_inode *dp;
516 struct xfs_trans *tp;
517 struct xfs_mount *mp;
518 struct xfs_dir2_leaf *leaf;
519 xfs_dablk_t blkno;
520 int level;
521 int error;
522 int size;
523
524 trace_xfs_da_root_split(state->args);
525
526 /*
527 * Copy the existing (incorrect) block from the root node position
528 * to a free space somewhere.
529 */
530 args = state->args;
531 error = xfs_da_grow_inode(args, &blkno);
532 if (error)
533 return error;
534
535 dp = args->dp;
536 tp = args->trans;
537 mp = state->mp;
538 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
539 if (error)
540 return error;
541 node = bp->b_addr;
542 oldroot = blk1->bp->b_addr;
543 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
544 oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
545 struct xfs_da3_icnode_hdr nodehdr;
546
547 dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
548 btree = dp->d_ops->node_tree_p(oldroot);
549 size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
550 level = nodehdr.level;
551
552 /*
553 * we are about to copy oldroot to bp, so set up the type
554 * of bp while we know exactly what it will be.
555 */
556 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
557 } else {
558 struct xfs_dir3_icleaf_hdr leafhdr;
559 struct xfs_dir2_leaf_entry *ents;
560
561 leaf = (xfs_dir2_leaf_t *)oldroot;
562 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
563 ents = dp->d_ops->leaf_ents_p(leaf);
564
565 ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
566 leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
567 size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
568 level = 0;
569
570 /*
571 * we are about to copy oldroot to bp, so set up the type
572 * of bp while we know exactly what it will be.
573 */
574 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
575 }
576
577 /*
578 * we can copy most of the information in the node from one block to
579 * another, but for CRC enabled headers we have to make sure that the
580 * block specific identifiers are kept intact. We update the buffer
581 * directly for this.
582 */
583 memcpy(node, oldroot, size);
584 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
585 oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
586 struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
587
588 node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
589 }
590 xfs_trans_log_buf(tp, bp, 0, size - 1);
591
592 bp->b_ops = blk1->bp->b_ops;
593 xfs_trans_buf_copy_type(bp, blk1->bp);
594 blk1->bp = bp;
595 blk1->blkno = blkno;
596
597 /*
598 * Set up the new root node.
599 */
600 error = xfs_da3_node_create(args,
601 (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
602 level + 1, &bp, args->whichfork);
603 if (error)
604 return error;
605
606 node = bp->b_addr;
607 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
608 btree = dp->d_ops->node_tree_p(node);
609 btree[0].hashval = cpu_to_be32(blk1->hashval);
610 btree[0].before = cpu_to_be32(blk1->blkno);
611 btree[1].hashval = cpu_to_be32(blk2->hashval);
612 btree[1].before = cpu_to_be32(blk2->blkno);
613 nodehdr.count = 2;
614 dp->d_ops->node_hdr_to_disk(node, &nodehdr);
615
616#ifdef DEBUG
617 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
618 oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
619 ASSERT(blk1->blkno >= args->geo->leafblk &&
620 blk1->blkno < args->geo->freeblk);
621 ASSERT(blk2->blkno >= args->geo->leafblk &&
622 blk2->blkno < args->geo->freeblk);
623 }
624#endif
625
626 /* Header is already logged by xfs_da_node_create */
627 xfs_trans_log_buf(tp, bp,
628 XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
629
630 return 0;
631}
632
633/*
634 * Split the node, rebalance, then add the new entry.
635 */
636STATIC int /* error */
637xfs_da3_node_split(
638 struct xfs_da_state *state,
639 struct xfs_da_state_blk *oldblk,
640 struct xfs_da_state_blk *newblk,
641 struct xfs_da_state_blk *addblk,
642 int treelevel,
643 int *result)
644{
645 struct xfs_da_intnode *node;
646 struct xfs_da3_icnode_hdr nodehdr;
647 xfs_dablk_t blkno;
648 int newcount;
649 int error;
650 int useextra;
651 struct xfs_inode *dp = state->args->dp;
652
653 trace_xfs_da_node_split(state->args);
654
655 node = oldblk->bp->b_addr;
656 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
657
658 /*
659 * With V2 dirs the extra block is data or freespace.
660 */
661 useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
662 newcount = 1 + useextra;
663 /*
664 * Do we have to split the node?
665 */
666 if (nodehdr.count + newcount > state->args->geo->node_ents) {
667 /*
668 * Allocate a new node, add to the doubly linked chain of
669 * nodes, then move some of our excess entries into it.
670 */
671 error = xfs_da_grow_inode(state->args, &blkno);
672 if (error)
673 return error; /* GROT: dir is inconsistent */
674
675 error = xfs_da3_node_create(state->args, blkno, treelevel,
676 &newblk->bp, state->args->whichfork);
677 if (error)
678 return error; /* GROT: dir is inconsistent */
679 newblk->blkno = blkno;
680 newblk->magic = XFS_DA_NODE_MAGIC;
681 xfs_da3_node_rebalance(state, oldblk, newblk);
682 error = xfs_da3_blk_link(state, oldblk, newblk);
683 if (error)
684 return error;
685 *result = 1;
686 } else {
687 *result = 0;
688 }
689
690 /*
691 * Insert the new entry(s) into the correct block
692 * (updating last hashval in the process).
693 *
694 * xfs_da3_node_add() inserts BEFORE the given index,
695 * and as a result of using node_lookup_int() we always
696 * point to a valid entry (not after one), but a split
697 * operation always results in a new block whose hashvals
698 * FOLLOW the current block.
699 *
700 * If we had double-split op below us, then add the extra block too.
701 */
702 node = oldblk->bp->b_addr;
703 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
704 if (oldblk->index <= nodehdr.count) {
705 oldblk->index++;
706 xfs_da3_node_add(state, oldblk, addblk);
707 if (useextra) {
708 if (state->extraafter)
709 oldblk->index++;
710 xfs_da3_node_add(state, oldblk, &state->extrablk);
711 state->extravalid = 0;
712 }
713 } else {
714 newblk->index++;
715 xfs_da3_node_add(state, newblk, addblk);
716 if (useextra) {
717 if (state->extraafter)
718 newblk->index++;
719 xfs_da3_node_add(state, newblk, &state->extrablk);
720 state->extravalid = 0;
721 }
722 }
723
724 return 0;
725}
726
727/*
728 * Balance the btree elements between two intermediate nodes,
729 * usually one full and one empty.
730 *
731 * NOTE: if blk2 is empty, then it will get the upper half of blk1.
732 */
733STATIC void
734xfs_da3_node_rebalance(
735 struct xfs_da_state *state,
736 struct xfs_da_state_blk *blk1,
737 struct xfs_da_state_blk *blk2)
738{
739 struct xfs_da_intnode *node1;
740 struct xfs_da_intnode *node2;
741 struct xfs_da_intnode *tmpnode;
742 struct xfs_da_node_entry *btree1;
743 struct xfs_da_node_entry *btree2;
744 struct xfs_da_node_entry *btree_s;
745 struct xfs_da_node_entry *btree_d;
746 struct xfs_da3_icnode_hdr nodehdr1;
747 struct xfs_da3_icnode_hdr nodehdr2;
748 struct xfs_trans *tp;
749 int count;
750 int tmp;
751 int swap = 0;
752 struct xfs_inode *dp = state->args->dp;
753
754 trace_xfs_da_node_rebalance(state->args);
755
756 node1 = blk1->bp->b_addr;
757 node2 = blk2->bp->b_addr;
758 dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
759 dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
760 btree1 = dp->d_ops->node_tree_p(node1);
761 btree2 = dp->d_ops->node_tree_p(node2);
762
763 /*
764 * Figure out how many entries need to move, and in which direction.
765 * Swap the nodes around if that makes it simpler.
766 */
767 if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
768 ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
769 (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
770 be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
771 tmpnode = node1;
772 node1 = node2;
773 node2 = tmpnode;
774 dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
775 dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
776 btree1 = dp->d_ops->node_tree_p(node1);
777 btree2 = dp->d_ops->node_tree_p(node2);
778 swap = 1;
779 }
780
781 count = (nodehdr1.count - nodehdr2.count) / 2;
782 if (count == 0)
783 return;
784 tp = state->args->trans;
785 /*
786 * Two cases: high-to-low and low-to-high.
787 */
788 if (count > 0) {
789 /*
790 * Move elements in node2 up to make a hole.
791 */
792 tmp = nodehdr2.count;
793 if (tmp > 0) {
794 tmp *= (uint)sizeof(xfs_da_node_entry_t);
795 btree_s = &btree2[0];
796 btree_d = &btree2[count];
797 memmove(btree_d, btree_s, tmp);
798 }
799
800 /*
801 * Move the req'd B-tree elements from high in node1 to
802 * low in node2.
803 */
804 nodehdr2.count += count;
805 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
806 btree_s = &btree1[nodehdr1.count - count];
807 btree_d = &btree2[0];
808 memcpy(btree_d, btree_s, tmp);
809 nodehdr1.count -= count;
810 } else {
811 /*
812 * Move the req'd B-tree elements from low in node2 to
813 * high in node1.
814 */
815 count = -count;
816 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
817 btree_s = &btree2[0];
818 btree_d = &btree1[nodehdr1.count];
819 memcpy(btree_d, btree_s, tmp);
820 nodehdr1.count += count;
821
822 xfs_trans_log_buf(tp, blk1->bp,
823 XFS_DA_LOGRANGE(node1, btree_d, tmp));
824
825 /*
826 * Move elements in node2 down to fill the hole.
827 */
828 tmp = nodehdr2.count - count;
829 tmp *= (uint)sizeof(xfs_da_node_entry_t);
830 btree_s = &btree2[count];
831 btree_d = &btree2[0];
832 memmove(btree_d, btree_s, tmp);
833 nodehdr2.count -= count;
834 }
835
836 /*
837 * Log header of node 1 and all current bits of node 2.
838 */
839 dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
840 xfs_trans_log_buf(tp, blk1->bp,
841 XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
842
843 dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
844 xfs_trans_log_buf(tp, blk2->bp,
845 XFS_DA_LOGRANGE(node2, &node2->hdr,
846 dp->d_ops->node_hdr_size +
847 (sizeof(btree2[0]) * nodehdr2.count)));
848
849 /*
850 * Record the last hashval from each block for upward propagation.
851 * (note: don't use the swapped node pointers)
852 */
853 if (swap) {
854 node1 = blk1->bp->b_addr;
855 node2 = blk2->bp->b_addr;
856 dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
857 dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
858 btree1 = dp->d_ops->node_tree_p(node1);
859 btree2 = dp->d_ops->node_tree_p(node2);
860 }
861 blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
862 blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
863
864 /*
865 * Adjust the expected index for insertion.
866 */
867 if (blk1->index >= nodehdr1.count) {
868 blk2->index = blk1->index - nodehdr1.count;
869 blk1->index = nodehdr1.count + 1; /* make it invalid */
870 }
871}
872
873/*
874 * Add a new entry to an intermediate node.
875 */
876STATIC void
877xfs_da3_node_add(
878 struct xfs_da_state *state,
879 struct xfs_da_state_blk *oldblk,
880 struct xfs_da_state_blk *newblk)
881{
882 struct xfs_da_intnode *node;
883 struct xfs_da3_icnode_hdr nodehdr;
884 struct xfs_da_node_entry *btree;
885 int tmp;
886 struct xfs_inode *dp = state->args->dp;
887
888 trace_xfs_da_node_add(state->args);
889
890 node = oldblk->bp->b_addr;
891 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
892 btree = dp->d_ops->node_tree_p(node);
893
894 ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
895 ASSERT(newblk->blkno != 0);
896 if (state->args->whichfork == XFS_DATA_FORK)
897 ASSERT(newblk->blkno >= state->args->geo->leafblk &&
898 newblk->blkno < state->args->geo->freeblk);
899
900 /*
901 * We may need to make some room before we insert the new node.
902 */
903 tmp = 0;
904 if (oldblk->index < nodehdr.count) {
905 tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
906 memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
907 }
908 btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
909 btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
910 xfs_trans_log_buf(state->args->trans, oldblk->bp,
911 XFS_DA_LOGRANGE(node, &btree[oldblk->index],
912 tmp + sizeof(*btree)));
913
914 nodehdr.count += 1;
915 dp->d_ops->node_hdr_to_disk(node, &nodehdr);
916 xfs_trans_log_buf(state->args->trans, oldblk->bp,
917 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
918
919 /*
920 * Copy the last hash value from the oldblk to propagate upwards.
921 */
922 oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
923}
924
925/*========================================================================
926 * Routines used for shrinking the Btree.
927 *========================================================================*/
928
929/*
930 * Deallocate an empty leaf node, remove it from its parent,
931 * possibly deallocating that block, etc...
932 */
933int
934xfs_da3_join(
935 struct xfs_da_state *state)
936{
937 struct xfs_da_state_blk *drop_blk;
938 struct xfs_da_state_blk *save_blk;
939 int action = 0;
940 int error;
941
942 trace_xfs_da_join(state->args);
943
944 drop_blk = &state->path.blk[ state->path.active-1 ];
945 save_blk = &state->altpath.blk[ state->path.active-1 ];
946 ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
947 ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
948 drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
949
950 /*
951 * Walk back up the tree joining/deallocating as necessary.
952 * When we stop dropping blocks, break out.
953 */
954 for ( ; state->path.active >= 2; drop_blk--, save_blk--,
955 state->path.active--) {
956 /*
957 * See if we can combine the block with a neighbor.
958 * (action == 0) => no options, just leave
959 * (action == 1) => coalesce, then unlink
960 * (action == 2) => block empty, unlink it
961 */
962 switch (drop_blk->magic) {
963 case XFS_ATTR_LEAF_MAGIC:
964 error = xfs_attr3_leaf_toosmall(state, &action);
965 if (error)
966 return error;
967 if (action == 0)
968 return 0;
969 xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
970 break;
971 case XFS_DIR2_LEAFN_MAGIC:
972 error = xfs_dir2_leafn_toosmall(state, &action);
973 if (error)
974 return error;
975 if (action == 0)
976 return 0;
977 xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
978 break;
979 case XFS_DA_NODE_MAGIC:
980 /*
981 * Remove the offending node, fixup hashvals,
982 * check for a toosmall neighbor.
983 */
984 xfs_da3_node_remove(state, drop_blk);
985 xfs_da3_fixhashpath(state, &state->path);
986 error = xfs_da3_node_toosmall(state, &action);
987 if (error)
988 return error;
989 if (action == 0)
990 return 0;
991 xfs_da3_node_unbalance(state, drop_blk, save_blk);
992 break;
993 }
994 xfs_da3_fixhashpath(state, &state->altpath);
995 error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
996 xfs_da_state_kill_altpath(state);
997 if (error)
998 return error;
999 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
1000 drop_blk->bp);
1001 drop_blk->bp = NULL;
1002 if (error)
1003 return error;
1004 }
1005 /*
1006 * We joined all the way to the top. If it turns out that
1007 * we only have one entry in the root, make the child block
1008 * the new root.
1009 */
1010 xfs_da3_node_remove(state, drop_blk);
1011 xfs_da3_fixhashpath(state, &state->path);
1012 error = xfs_da3_root_join(state, &state->path.blk[0]);
1013 return error;
1014}
1015
#ifdef DEBUG
/*
 * Debug-only sanity check on the sole child of a root that is about to be
 * collapsed: below a level 1 root it must be a directory or attribute
 * leaf, otherwise it must be a da node, and it must have no siblings.
 */
static void
xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
{
	__be16	magic = blkinfo->magic;

	if (level != 1) {
		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
		       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
	} else {
		ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
		       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
		       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
	}
	ASSERT(!blkinfo->forw);
	ASSERT(!blkinfo->back);
}
#else	/* !DEBUG */
/* Non-debug builds: the check expands to nothing. */
#define	xfs_da_blkinfo_onlychild_validate(blkinfo, level)
#endif	/* !DEBUG */
1037
1038/*
1039 * We have only one entry in the root. Copy the only remaining child of
1040 * the old root to block 0 as the new root node.
1041 */
1042STATIC int
1043xfs_da3_root_join(
1044 struct xfs_da_state *state,
1045 struct xfs_da_state_blk *root_blk)
1046{
1047 struct xfs_da_intnode *oldroot;
1048 struct xfs_da_args *args;
1049 xfs_dablk_t child;
1050 struct xfs_buf *bp;
1051 struct xfs_da3_icnode_hdr oldroothdr;
1052 struct xfs_da_node_entry *btree;
1053 int error;
1054 struct xfs_inode *dp = state->args->dp;
1055
1056 trace_xfs_da_root_join(state->args);
1057
1058 ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
1059
1060 args = state->args;
1061 oldroot = root_blk->bp->b_addr;
1062 dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
1063 ASSERT(oldroothdr.forw == 0);
1064 ASSERT(oldroothdr.back == 0);
1065
1066 /*
1067 * If the root has more than one child, then don't do anything.
1068 */
1069 if (oldroothdr.count > 1)
1070 return 0;
1071
1072 /*
1073 * Read in the (only) child block, then copy those bytes into
1074 * the root block's buffer and free the original child block.
1075 */
1076 btree = dp->d_ops->node_tree_p(oldroot);
1077 child = be32_to_cpu(btree[0].before);
1078 ASSERT(child != 0);
1079 error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
1080 args->whichfork);
1081 if (error)
1082 return error;
1083 xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
1084
1085 /*
1086 * This could be copying a leaf back into the root block in the case of
1087 * there only being a single leaf block left in the tree. Hence we have
1088 * to update the b_ops pointer as well to match the buffer type change
1089 * that could occur. For dir3 blocks we also need to update the block
1090 * number in the buffer header.
1091 */
1092 memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
1093 root_blk->bp->b_ops = bp->b_ops;
1094 xfs_trans_buf_copy_type(root_blk->bp, bp);
1095 if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
1096 struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
1097 da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
1098 }
1099 xfs_trans_log_buf(args->trans, root_blk->bp, 0,
1100 args->geo->blksize - 1);
1101 error = xfs_da_shrink_inode(args, child, bp);
1102 return error;
1103}
1104
1105/*
1106 * Check a node block and its neighbors to see if the block should be
1107 * collapsed into one or the other neighbor. Always keep the block
1108 * with the smaller block number.
1109 * If the current block is over 50% full, don't try to join it, return 0.
1110 * If the block is empty, fill in the state structure and return 2.
1111 * If it can be collapsed, fill in the state structure and return 1.
1112 * If nothing can be done, return 0.
1113 */
1114STATIC int
1115xfs_da3_node_toosmall(
1116 struct xfs_da_state *state,
1117 int *action)
1118{
1119 struct xfs_da_intnode *node;
1120 struct xfs_da_state_blk *blk;
1121 struct xfs_da_blkinfo *info;
1122 xfs_dablk_t blkno;
1123 struct xfs_buf *bp;
1124 struct xfs_da3_icnode_hdr nodehdr;
1125 int count;
1126 int forward;
1127 int error;
1128 int retval;
1129 int i;
1130 struct xfs_inode *dp = state->args->dp;
1131
1132 trace_xfs_da_node_toosmall(state->args);
1133
1134 /*
1135 * Check for the degenerate case of the block being over 50% full.
1136 * If so, it's not worth even looking to see if we might be able
1137 * to coalesce with a sibling.
1138 */
1139 blk = &state->path.blk[ state->path.active-1 ];
1140 info = blk->bp->b_addr;
1141 node = (xfs_da_intnode_t *)info;
1142 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1143 if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
1144 *action = 0; /* blk over 50%, don't try to join */
1145 return 0; /* blk over 50%, don't try to join */
1146 }
1147
1148 /*
1149 * Check for the degenerate case of the block being empty.
1150 * If the block is empty, we'll simply delete it, no need to
1151 * coalesce it with a sibling block. We choose (arbitrarily)
1152 * to merge with the forward block unless it is NULL.
1153 */
1154 if (nodehdr.count == 0) {
1155 /*
1156 * Make altpath point to the block we want to keep and
1157 * path point to the block we want to drop (this one).
1158 */
1159 forward = (info->forw != 0);
1160 memcpy(&state->altpath, &state->path, sizeof(state->path));
1161 error = xfs_da3_path_shift(state, &state->altpath, forward,
1162 0, &retval);
1163 if (error)
1164 return error;
1165 if (retval) {
1166 *action = 0;
1167 } else {
1168 *action = 2;
1169 }
1170 return 0;
1171 }
1172
1173 /*
1174 * Examine each sibling block to see if we can coalesce with
1175 * at least 25% free space to spare. We need to figure out
1176 * whether to merge with the forward or the backward block.
1177 * We prefer coalescing with the lower numbered sibling so as
1178 * to shrink a directory over time.
1179 */
1180 count = state->args->geo->node_ents;
1181 count -= state->args->geo->node_ents >> 2;
1182 count -= nodehdr.count;
1183
1184 /* start with smaller blk num */
1185 forward = nodehdr.forw < nodehdr.back;
1186 for (i = 0; i < 2; forward = !forward, i++) {
1187 struct xfs_da3_icnode_hdr thdr;
1188 if (forward)
1189 blkno = nodehdr.forw;
1190 else
1191 blkno = nodehdr.back;
1192 if (blkno == 0)
1193 continue;
1194 error = xfs_da3_node_read(state->args->trans, dp,
1195 blkno, -1, &bp, state->args->whichfork);
1196 if (error)
1197 return error;
1198
1199 node = bp->b_addr;
1200 dp->d_ops->node_hdr_from_disk(&thdr, node);
1201 xfs_trans_brelse(state->args->trans, bp);
1202
1203 if (count - thdr.count >= 0)
1204 break; /* fits with at least 25% to spare */
1205 }
1206 if (i >= 2) {
1207 *action = 0;
1208 return 0;
1209 }
1210
1211 /*
1212 * Make altpath point to the block we want to keep (the lower
1213 * numbered block) and path point to the block we want to drop.
1214 */
1215 memcpy(&state->altpath, &state->path, sizeof(state->path));
1216 if (blkno < blk->blkno) {
1217 error = xfs_da3_path_shift(state, &state->altpath, forward,
1218 0, &retval);
1219 } else {
1220 error = xfs_da3_path_shift(state, &state->path, forward,
1221 0, &retval);
1222 }
1223 if (error)
1224 return error;
1225 if (retval) {
1226 *action = 0;
1227 return 0;
1228 }
1229 *action = 1;
1230 return 0;
1231}
1232
1233/*
1234 * Pick up the last hashvalue from an intermediate node.
1235 */
1236STATIC uint
1237xfs_da3_node_lasthash(
1238 struct xfs_inode *dp,
1239 struct xfs_buf *bp,
1240 int *count)
1241{
1242 struct xfs_da_intnode *node;
1243 struct xfs_da_node_entry *btree;
1244 struct xfs_da3_icnode_hdr nodehdr;
1245
1246 node = bp->b_addr;
1247 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1248 if (count)
1249 *count = nodehdr.count;
1250 if (!nodehdr.count)
1251 return 0;
1252 btree = dp->d_ops->node_tree_p(node);
1253 return be32_to_cpu(btree[nodehdr.count - 1].hashval);
1254}
1255
1256/*
1257 * Walk back up the tree adjusting hash values as necessary,
1258 * when we stop making changes, return.
1259 */
1260void
1261xfs_da3_fixhashpath(
1262 struct xfs_da_state *state,
1263 struct xfs_da_state_path *path)
1264{
1265 struct xfs_da_state_blk *blk;
1266 struct xfs_da_intnode *node;
1267 struct xfs_da_node_entry *btree;
1268 xfs_dahash_t lasthash=0;
1269 int level;
1270 int count;
1271 struct xfs_inode *dp = state->args->dp;
1272
1273 trace_xfs_da_fixhashpath(state->args);
1274
1275 level = path->active-1;
1276 blk = &path->blk[ level ];
1277 switch (blk->magic) {
1278 case XFS_ATTR_LEAF_MAGIC:
1279 lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
1280 if (count == 0)
1281 return;
1282 break;
1283 case XFS_DIR2_LEAFN_MAGIC:
1284 lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
1285 if (count == 0)
1286 return;
1287 break;
1288 case XFS_DA_NODE_MAGIC:
1289 lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
1290 if (count == 0)
1291 return;
1292 break;
1293 }
1294 for (blk--, level--; level >= 0; blk--, level--) {
1295 struct xfs_da3_icnode_hdr nodehdr;
1296
1297 node = blk->bp->b_addr;
1298 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1299 btree = dp->d_ops->node_tree_p(node);
1300 if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
1301 break;
1302 blk->hashval = lasthash;
1303 btree[blk->index].hashval = cpu_to_be32(lasthash);
1304 xfs_trans_log_buf(state->args->trans, blk->bp,
1305 XFS_DA_LOGRANGE(node, &btree[blk->index],
1306 sizeof(*btree)));
1307
1308 lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1309 }
1310}
1311
1312/*
1313 * Remove an entry from an intermediate node.
1314 */
1315STATIC void
1316xfs_da3_node_remove(
1317 struct xfs_da_state *state,
1318 struct xfs_da_state_blk *drop_blk)
1319{
1320 struct xfs_da_intnode *node;
1321 struct xfs_da3_icnode_hdr nodehdr;
1322 struct xfs_da_node_entry *btree;
1323 int index;
1324 int tmp;
1325 struct xfs_inode *dp = state->args->dp;
1326
1327 trace_xfs_da_node_remove(state->args);
1328
1329 node = drop_blk->bp->b_addr;
1330 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1331 ASSERT(drop_blk->index < nodehdr.count);
1332 ASSERT(drop_blk->index >= 0);
1333
1334 /*
1335 * Copy over the offending entry, or just zero it out.
1336 */
1337 index = drop_blk->index;
1338 btree = dp->d_ops->node_tree_p(node);
1339 if (index < nodehdr.count - 1) {
1340 tmp = nodehdr.count - index - 1;
1341 tmp *= (uint)sizeof(xfs_da_node_entry_t);
1342 memmove(&btree[index], &btree[index + 1], tmp);
1343 xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1344 XFS_DA_LOGRANGE(node, &btree[index], tmp));
1345 index = nodehdr.count - 1;
1346 }
1347 memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
1348 xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1349 XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
1350 nodehdr.count -= 1;
1351 dp->d_ops->node_hdr_to_disk(node, &nodehdr);
1352 xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1353 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
1354
1355 /*
1356 * Copy the last hash value from the block to propagate upwards.
1357 */
1358 drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
1359}
1360
1361/*
1362 * Unbalance the elements between two intermediate nodes,
1363 * move all Btree elements from one node into another.
1364 */
1365STATIC void
1366xfs_da3_node_unbalance(
1367 struct xfs_da_state *state,
1368 struct xfs_da_state_blk *drop_blk,
1369 struct xfs_da_state_blk *save_blk)
1370{
1371 struct xfs_da_intnode *drop_node;
1372 struct xfs_da_intnode *save_node;
1373 struct xfs_da_node_entry *drop_btree;
1374 struct xfs_da_node_entry *save_btree;
1375 struct xfs_da3_icnode_hdr drop_hdr;
1376 struct xfs_da3_icnode_hdr save_hdr;
1377 struct xfs_trans *tp;
1378 int sindex;
1379 int tmp;
1380 struct xfs_inode *dp = state->args->dp;
1381
1382 trace_xfs_da_node_unbalance(state->args);
1383
1384 drop_node = drop_blk->bp->b_addr;
1385 save_node = save_blk->bp->b_addr;
1386 dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
1387 dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
1388 drop_btree = dp->d_ops->node_tree_p(drop_node);
1389 save_btree = dp->d_ops->node_tree_p(save_node);
1390 tp = state->args->trans;
1391
1392 /*
1393 * If the dying block has lower hashvals, then move all the
1394 * elements in the remaining block up to make a hole.
1395 */
1396 if ((be32_to_cpu(drop_btree[0].hashval) <
1397 be32_to_cpu(save_btree[0].hashval)) ||
1398 (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
1399 be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
1400 /* XXX: check this - is memmove dst correct? */
1401 tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
1402 memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
1403
1404 sindex = 0;
1405 xfs_trans_log_buf(tp, save_blk->bp,
1406 XFS_DA_LOGRANGE(save_node, &save_btree[0],
1407 (save_hdr.count + drop_hdr.count) *
1408 sizeof(xfs_da_node_entry_t)));
1409 } else {
1410 sindex = save_hdr.count;
1411 xfs_trans_log_buf(tp, save_blk->bp,
1412 XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
1413 drop_hdr.count * sizeof(xfs_da_node_entry_t)));
1414 }
1415
1416 /*
1417 * Move all the B-tree elements from drop_blk to save_blk.
1418 */
1419 tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
1420 memcpy(&save_btree[sindex], &drop_btree[0], tmp);
1421 save_hdr.count += drop_hdr.count;
1422
1423 dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
1424 xfs_trans_log_buf(tp, save_blk->bp,
1425 XFS_DA_LOGRANGE(save_node, &save_node->hdr,
1426 dp->d_ops->node_hdr_size));
1427
1428 /*
1429 * Save the last hashval in the remaining block for upward propagation.
1430 */
1431 save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
1432}
1433
1434/*========================================================================
1435 * Routines used for finding things in the Btree.
1436 *========================================================================*/
1437
1438/*
1439 * Walk down the Btree looking for a particular filename, filling
1440 * in the state structure as we go.
1441 *
1442 * We will set the state structure to point to each of the elements
1443 * in each of the nodes where either the hashval is or should be.
1444 *
1445 * We support duplicate hashval's so for each entry in the current
1446 * node that could contain the desired hashval, descend. This is a
1447 * pruned depth-first tree search.
1448 */
1449int /* error */
1450xfs_da3_node_lookup_int(
1451 struct xfs_da_state *state,
1452 int *result)
1453{
1454 struct xfs_da_state_blk *blk;
1455 struct xfs_da_blkinfo *curr;
1456 struct xfs_da_intnode *node;
1457 struct xfs_da_node_entry *btree;
1458 struct xfs_da3_icnode_hdr nodehdr;
1459 struct xfs_da_args *args;
1460 xfs_dablk_t blkno;
1461 xfs_dahash_t hashval;
1462 xfs_dahash_t btreehashval;
1463 int probe;
1464 int span;
1465 int max;
1466 int error;
1467 int retval;
1468 struct xfs_inode *dp = state->args->dp;
1469
1470 args = state->args;
1471
1472 /*
1473 * Descend thru the B-tree searching each level for the right
1474 * node to use, until the right hashval is found.
1475 */
1476 blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
1477 for (blk = &state->path.blk[0], state->path.active = 1;
1478 state->path.active <= XFS_DA_NODE_MAXDEPTH;
1479 blk++, state->path.active++) {
1480 /*
1481 * Read the next node down in the tree.
1482 */
1483 blk->blkno = blkno;
1484 error = xfs_da3_node_read(args->trans, args->dp, blkno,
1485 -1, &blk->bp, args->whichfork);
1486 if (error) {
1487 blk->blkno = 0;
1488 state->path.active--;
1489 return error;
1490 }
1491 curr = blk->bp->b_addr;
1492 blk->magic = be16_to_cpu(curr->magic);
1493
1494 if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
1495 blk->magic == XFS_ATTR3_LEAF_MAGIC) {
1496 blk->magic = XFS_ATTR_LEAF_MAGIC;
1497 blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1498 break;
1499 }
1500
1501 if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1502 blk->magic == XFS_DIR3_LEAFN_MAGIC) {
1503 blk->magic = XFS_DIR2_LEAFN_MAGIC;
1504 blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
1505 blk->bp, NULL);
1506 break;
1507 }
1508
1509 blk->magic = XFS_DA_NODE_MAGIC;
1510
1511
1512 /*
1513 * Search an intermediate node for a match.
1514 */
1515 node = blk->bp->b_addr;
1516 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1517 btree = dp->d_ops->node_tree_p(node);
1518
1519 max = nodehdr.count;
1520 blk->hashval = be32_to_cpu(btree[max - 1].hashval);
1521
1522 /*
1523 * Binary search. (note: small blocks will skip loop)
1524 */
1525 probe = span = max / 2;
1526 hashval = args->hashval;
1527 while (span > 4) {
1528 span /= 2;
1529 btreehashval = be32_to_cpu(btree[probe].hashval);
1530 if (btreehashval < hashval)
1531 probe += span;
1532 else if (btreehashval > hashval)
1533 probe -= span;
1534 else
1535 break;
1536 }
1537 ASSERT((probe >= 0) && (probe < max));
1538 ASSERT((span <= 4) ||
1539 (be32_to_cpu(btree[probe].hashval) == hashval));
1540
1541 /*
1542 * Since we may have duplicate hashval's, find the first
1543 * matching hashval in the node.
1544 */
1545 while (probe > 0 &&
1546 be32_to_cpu(btree[probe].hashval) >= hashval) {
1547 probe--;
1548 }
1549 while (probe < max &&
1550 be32_to_cpu(btree[probe].hashval) < hashval) {
1551 probe++;
1552 }
1553
1554 /*
1555 * Pick the right block to descend on.
1556 */
1557 if (probe == max) {
1558 blk->index = max - 1;
1559 blkno = be32_to_cpu(btree[max - 1].before);
1560 } else {
1561 blk->index = probe;
1562 blkno = be32_to_cpu(btree[probe].before);
1563 }
1564 }
1565
1566 /*
1567 * A leaf block that ends in the hashval that we are interested in
1568 * (final hashval == search hashval) means that the next block may
1569 * contain more entries with the same hashval, shift upward to the
1570 * next leaf and keep searching.
1571 */
1572 for (;;) {
1573 if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1574 retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1575 &blk->index, state);
1576 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1577 retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
1578 blk->index = args->index;
1579 args->blkno = blk->blkno;
1580 } else {
1581 ASSERT(0);
1582 return -EFSCORRUPTED;
1583 }
1584 if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
1585 (blk->hashval == args->hashval)) {
1586 error = xfs_da3_path_shift(state, &state->path, 1, 1,
1587 &retval);
1588 if (error)
1589 return error;
1590 if (retval == 0) {
1591 continue;
1592 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1593 /* path_shift() gives ENOENT */
1594 retval = -ENOATTR;
1595 }
1596 }
1597 break;
1598 }
1599 *result = retval;
1600 return 0;
1601}
1602
1603/*========================================================================
1604 * Utility routines.
1605 *========================================================================*/
1606
1607/*
1608 * Compare two intermediate nodes for "order".
1609 */
1610STATIC int
1611xfs_da3_node_order(
1612 struct xfs_inode *dp,
1613 struct xfs_buf *node1_bp,
1614 struct xfs_buf *node2_bp)
1615{
1616 struct xfs_da_intnode *node1;
1617 struct xfs_da_intnode *node2;
1618 struct xfs_da_node_entry *btree1;
1619 struct xfs_da_node_entry *btree2;
1620 struct xfs_da3_icnode_hdr node1hdr;
1621 struct xfs_da3_icnode_hdr node2hdr;
1622
1623 node1 = node1_bp->b_addr;
1624 node2 = node2_bp->b_addr;
1625 dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
1626 dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
1627 btree1 = dp->d_ops->node_tree_p(node1);
1628 btree2 = dp->d_ops->node_tree_p(node2);
1629
1630 if (node1hdr.count > 0 && node2hdr.count > 0 &&
1631 ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
1632 (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
1633 be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
1634 return 1;
1635 }
1636 return 0;
1637}
1638
1639/*
1640 * Link a new block into a doubly linked list of blocks (of whatever type).
1641 */
1642int /* error */
1643xfs_da3_blk_link(
1644 struct xfs_da_state *state,
1645 struct xfs_da_state_blk *old_blk,
1646 struct xfs_da_state_blk *new_blk)
1647{
1648 struct xfs_da_blkinfo *old_info;
1649 struct xfs_da_blkinfo *new_info;
1650 struct xfs_da_blkinfo *tmp_info;
1651 struct xfs_da_args *args;
1652 struct xfs_buf *bp;
1653 int before = 0;
1654 int error;
1655 struct xfs_inode *dp = state->args->dp;
1656
1657 /*
1658 * Set up environment.
1659 */
1660 args = state->args;
1661 ASSERT(args != NULL);
1662 old_info = old_blk->bp->b_addr;
1663 new_info = new_blk->bp->b_addr;
1664 ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
1665 old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1666 old_blk->magic == XFS_ATTR_LEAF_MAGIC);
1667
1668 switch (old_blk->magic) {
1669 case XFS_ATTR_LEAF_MAGIC:
1670 before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
1671 break;
1672 case XFS_DIR2_LEAFN_MAGIC:
1673 before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
1674 break;
1675 case XFS_DA_NODE_MAGIC:
1676 before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
1677 break;
1678 }
1679
1680 /*
1681 * Link blocks in appropriate order.
1682 */
1683 if (before) {
1684 /*
1685 * Link new block in before existing block.
1686 */
1687 trace_xfs_da_link_before(args);
1688 new_info->forw = cpu_to_be32(old_blk->blkno);
1689 new_info->back = old_info->back;
1690 if (old_info->back) {
1691 error = xfs_da3_node_read(args->trans, dp,
1692 be32_to_cpu(old_info->back),
1693 -1, &bp, args->whichfork);
1694 if (error)
1695 return error;
1696 ASSERT(bp != NULL);
1697 tmp_info = bp->b_addr;
1698 ASSERT(tmp_info->magic == old_info->magic);
1699 ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
1700 tmp_info->forw = cpu_to_be32(new_blk->blkno);
1701 xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1702 }
1703 old_info->back = cpu_to_be32(new_blk->blkno);
1704 } else {
1705 /*
1706 * Link new block in after existing block.
1707 */
1708 trace_xfs_da_link_after(args);
1709 new_info->forw = old_info->forw;
1710 new_info->back = cpu_to_be32(old_blk->blkno);
1711 if (old_info->forw) {
1712 error = xfs_da3_node_read(args->trans, dp,
1713 be32_to_cpu(old_info->forw),
1714 -1, &bp, args->whichfork);
1715 if (error)
1716 return error;
1717 ASSERT(bp != NULL);
1718 tmp_info = bp->b_addr;
1719 ASSERT(tmp_info->magic == old_info->magic);
1720 ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
1721 tmp_info->back = cpu_to_be32(new_blk->blkno);
1722 xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1723 }
1724 old_info->forw = cpu_to_be32(new_blk->blkno);
1725 }
1726
1727 xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1728 xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1729 return 0;
1730}
1731
1732/*
1733 * Unlink a block from a doubly linked list of blocks.
1734 */
1735STATIC int /* error */
1736xfs_da3_blk_unlink(
1737 struct xfs_da_state *state,
1738 struct xfs_da_state_blk *drop_blk,
1739 struct xfs_da_state_blk *save_blk)
1740{
1741 struct xfs_da_blkinfo *drop_info;
1742 struct xfs_da_blkinfo *save_info;
1743 struct xfs_da_blkinfo *tmp_info;
1744 struct xfs_da_args *args;
1745 struct xfs_buf *bp;
1746 int error;
1747
1748 /*
1749 * Set up environment.
1750 */
1751 args = state->args;
1752 ASSERT(args != NULL);
1753 save_info = save_blk->bp->b_addr;
1754 drop_info = drop_blk->bp->b_addr;
1755 ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
1756 save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1757 save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1758 ASSERT(save_blk->magic == drop_blk->magic);
1759 ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
1760 (be32_to_cpu(save_info->back) == drop_blk->blkno));
1761 ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
1762 (be32_to_cpu(drop_info->back) == save_blk->blkno));
1763
1764 /*
1765 * Unlink the leaf block from the doubly linked chain of leaves.
1766 */
1767 if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
1768 trace_xfs_da_unlink_back(args);
1769 save_info->back = drop_info->back;
1770 if (drop_info->back) {
1771 error = xfs_da3_node_read(args->trans, args->dp,
1772 be32_to_cpu(drop_info->back),
1773 -1, &bp, args->whichfork);
1774 if (error)
1775 return error;
1776 ASSERT(bp != NULL);
1777 tmp_info = bp->b_addr;
1778 ASSERT(tmp_info->magic == save_info->magic);
1779 ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
1780 tmp_info->forw = cpu_to_be32(save_blk->blkno);
1781 xfs_trans_log_buf(args->trans, bp, 0,
1782 sizeof(*tmp_info) - 1);
1783 }
1784 } else {
1785 trace_xfs_da_unlink_forward(args);
1786 save_info->forw = drop_info->forw;
1787 if (drop_info->forw) {
1788 error = xfs_da3_node_read(args->trans, args->dp,
1789 be32_to_cpu(drop_info->forw),
1790 -1, &bp, args->whichfork);
1791 if (error)
1792 return error;
1793 ASSERT(bp != NULL);
1794 tmp_info = bp->b_addr;
1795 ASSERT(tmp_info->magic == save_info->magic);
1796 ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
1797 tmp_info->back = cpu_to_be32(save_blk->blkno);
1798 xfs_trans_log_buf(args->trans, bp, 0,
1799 sizeof(*tmp_info) - 1);
1800 }
1801 }
1802
1803 xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1804 return 0;
1805}
1806
1807/*
1808 * Move a path "forward" or "!forward" one block at the current level.
1809 *
1810 * This routine will adjust a "path" to point to the next block
1811 * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
1812 * Btree, including updating pointers to the intermediate nodes between
1813 * the new bottom and the root.
1814 */
1815int /* error */
1816xfs_da3_path_shift(
1817 struct xfs_da_state *state,
1818 struct xfs_da_state_path *path,
1819 int forward,
1820 int release,
1821 int *result)
1822{
1823 struct xfs_da_state_blk *blk;
1824 struct xfs_da_blkinfo *info;
1825 struct xfs_da_intnode *node;
1826 struct xfs_da_args *args;
1827 struct xfs_da_node_entry *btree;
1828 struct xfs_da3_icnode_hdr nodehdr;
1829 xfs_dablk_t blkno = 0;
1830 int level;
1831 int error;
1832 struct xfs_inode *dp = state->args->dp;
1833
1834 trace_xfs_da_path_shift(state->args);
1835
1836 /*
1837 * Roll up the Btree looking for the first block where our
1838 * current index is not at the edge of the block. Note that
1839 * we skip the bottom layer because we want the sibling block.
1840 */
1841 args = state->args;
1842 ASSERT(args != NULL);
1843 ASSERT(path != NULL);
1844 ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1845 level = (path->active-1) - 1; /* skip bottom layer in path */
1846 for (blk = &path->blk[level]; level >= 0; blk--, level--) {
1847 node = blk->bp->b_addr;
1848 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1849 btree = dp->d_ops->node_tree_p(node);
1850
1851 if (forward && (blk->index < nodehdr.count - 1)) {
1852 blk->index++;
1853 blkno = be32_to_cpu(btree[blk->index].before);
1854 break;
1855 } else if (!forward && (blk->index > 0)) {
1856 blk->index--;
1857 blkno = be32_to_cpu(btree[blk->index].before);
1858 break;
1859 }
1860 }
1861 if (level < 0) {
1862 *result = -ENOENT; /* we're out of our tree */
1863 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1864 return 0;
1865 }
1866
1867 /*
1868 * Roll down the edge of the subtree until we reach the
1869 * same depth we were at originally.
1870 */
1871 for (blk++, level++; level < path->active; blk++, level++) {
1872 /*
1873 * Release the old block.
1874 * (if it's dirty, trans won't actually let go)
1875 */
1876 if (release)
1877 xfs_trans_brelse(args->trans, blk->bp);
1878
1879 /*
1880 * Read the next child block.
1881 */
1882 blk->blkno = blkno;
1883 error = xfs_da3_node_read(args->trans, dp, blkno, -1,
1884 &blk->bp, args->whichfork);
1885 if (error)
1886 return error;
1887 info = blk->bp->b_addr;
1888 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1889 info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
1890 info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1891 info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
1892 info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
1893 info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
1894
1895
1896 /*
1897 * Note: we flatten the magic number to a single type so we
1898 * don't have to compare against crc/non-crc types elsewhere.
1899 */
1900 switch (be16_to_cpu(info->magic)) {
1901 case XFS_DA_NODE_MAGIC:
1902 case XFS_DA3_NODE_MAGIC:
1903 blk->magic = XFS_DA_NODE_MAGIC;
1904 node = (xfs_da_intnode_t *)info;
1905 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1906 btree = dp->d_ops->node_tree_p(node);
1907 blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1908 if (forward)
1909 blk->index = 0;
1910 else
1911 blk->index = nodehdr.count - 1;
1912 blkno = be32_to_cpu(btree[blk->index].before);
1913 break;
1914 case XFS_ATTR_LEAF_MAGIC:
1915 case XFS_ATTR3_LEAF_MAGIC:
1916 blk->magic = XFS_ATTR_LEAF_MAGIC;
1917 ASSERT(level == path->active-1);
1918 blk->index = 0;
1919 blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1920 break;
1921 case XFS_DIR2_LEAFN_MAGIC:
1922 case XFS_DIR3_LEAFN_MAGIC:
1923 blk->magic = XFS_DIR2_LEAFN_MAGIC;
1924 ASSERT(level == path->active-1);
1925 blk->index = 0;
1926 blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
1927 blk->bp, NULL);
1928 break;
1929 default:
1930 ASSERT(0);
1931 break;
1932 }
1933 }
1934 *result = 0;
1935 return 0;
1936}
1937
1938
1939/*========================================================================
1940 * Utility routines.
1941 *========================================================================*/
1942
1943/*
1944 * Implement a simple hash on a character string.
1945 * Rotate the hash value by 7 bits, then XOR each character in.
1946 * This is implemented with some source-level loop unrolling.
1947 */
1948xfs_dahash_t
1949xfs_da_hashname(const __uint8_t *name, int namelen)
1950{
1951 xfs_dahash_t hash;
1952
1953 /*
1954 * Do four characters at a time as long as we can.
1955 */
1956 for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
1957 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
1958 (name[3] << 0) ^ rol32(hash, 7 * 4);
1959
1960 /*
1961 * Now do the rest of the characters.
1962 */
1963 switch (namelen) {
1964 case 3:
1965 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
1966 rol32(hash, 7 * 3);
1967 case 2:
1968 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
1969 case 1:
1970 return (name[0] << 0) ^ rol32(hash, 7 * 1);
1971 default: /* case 0: */
1972 return hash;
1973 }
1974}
1975
1976enum xfs_dacmp
1977xfs_da_compname(
1978 struct xfs_da_args *args,
1979 const unsigned char *name,
1980 int len)
1981{
1982 return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
1983 XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
1984}
1985
1986static xfs_dahash_t
1987xfs_default_hashname(
1988 struct xfs_name *name)
1989{
1990 return xfs_da_hashname(name->name, name->len);
1991}
1992
1993const struct xfs_nameops xfs_default_nameops = {
1994 .hashname = xfs_default_hashname,
1995 .compname = xfs_da_compname
1996};
1997
1998int
1999xfs_da_grow_inode_int(
2000 struct xfs_da_args *args,
2001 xfs_fileoff_t *bno,
2002 int count)
2003{
2004 struct xfs_trans *tp = args->trans;
2005 struct xfs_inode *dp = args->dp;
2006 int w = args->whichfork;
2007 xfs_drfsbno_t nblks = dp->i_d.di_nblocks;
2008 struct xfs_bmbt_irec map, *mapp;
2009 int nmap, error, got, i, mapi;
2010
2011 /*
2012 * Find a spot in the file space to put the new block.
2013 */
2014 error = xfs_bmap_first_unused(tp, dp, count, bno, w);
2015 if (error)
2016 return error;
2017
2018 /*
2019 * Try mapping it in one filesystem block.
2020 */
2021 nmap = 1;
2022 ASSERT(args->firstblock != NULL);
2023 error = xfs_bmapi_write(tp, dp, *bno, count,
2024 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
2025 args->firstblock, args->total, &map, &nmap,
2026 args->flist);
2027 if (error)
2028 return error;
2029
2030 ASSERT(nmap <= 1);
2031 if (nmap == 1) {
2032 mapp = &map;
2033 mapi = 1;
2034 } else if (nmap == 0 && count > 1) {
2035 xfs_fileoff_t b;
2036 int c;
2037
2038 /*
2039 * If we didn't get it and the block might work if fragmented,
2040 * try without the CONTIG flag. Loop until we get it all.
2041 */
2042 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
2043 for (b = *bno, mapi = 0; b < *bno + count; ) {
2044 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
2045 c = (int)(*bno + count - b);
2046 error = xfs_bmapi_write(tp, dp, b, c,
2047 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
2048 args->firstblock, args->total,
2049 &mapp[mapi], &nmap, args->flist);
2050 if (error)
2051 goto out_free_map;
2052 if (nmap < 1)
2053 break;
2054 mapi += nmap;
2055 b = mapp[mapi - 1].br_startoff +
2056 mapp[mapi - 1].br_blockcount;
2057 }
2058 } else {
2059 mapi = 0;
2060 mapp = NULL;
2061 }
2062
2063 /*
2064 * Count the blocks we got, make sure it matches the total.
2065 */
2066 for (i = 0, got = 0; i < mapi; i++)
2067 got += mapp[i].br_blockcount;
2068 if (got != count || mapp[0].br_startoff != *bno ||
2069 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
2070 *bno + count) {
2071 error = -ENOSPC;
2072 goto out_free_map;
2073 }
2074
2075 /* account for newly allocated blocks in reserved blocks total */
2076 args->total -= dp->i_d.di_nblocks - nblks;
2077
2078out_free_map:
2079 if (mapp != &map)
2080 kmem_free(mapp);
2081 return error;
2082}
2083
2084/*
2085 * Add a block to the btree ahead of the file.
2086 * Return the new block number to the caller.
2087 */
2088int
2089xfs_da_grow_inode(
2090 struct xfs_da_args *args,
2091 xfs_dablk_t *new_blkno)
2092{
2093 xfs_fileoff_t bno;
2094 int error;
2095
2096 trace_xfs_da_grow_inode(args);
2097
2098 bno = args->geo->leafblk;
2099 error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
2100 if (!error)
2101 *new_blkno = (xfs_dablk_t)bno;
2102 return error;
2103}
2104
2105/*
2106 * Ick. We need to always be able to remove a btree block, even
2107 * if there's no space reservation because the filesystem is full.
2108 * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
2109 * It swaps the target block with the last block in the file. The
2110 * last block in the file can always be removed since it can't cause
2111 * a bmap btree split to do that.
2112 */
2113STATIC int
2114xfs_da3_swap_lastblock(
2115 struct xfs_da_args *args,
2116 xfs_dablk_t *dead_blknop,
2117 struct xfs_buf **dead_bufp)
2118{
2119 struct xfs_da_blkinfo *dead_info;
2120 struct xfs_da_blkinfo *sib_info;
2121 struct xfs_da_intnode *par_node;
2122 struct xfs_da_intnode *dead_node;
2123 struct xfs_dir2_leaf *dead_leaf2;
2124 struct xfs_da_node_entry *btree;
2125 struct xfs_da3_icnode_hdr par_hdr;
2126 struct xfs_inode *dp;
2127 struct xfs_trans *tp;
2128 struct xfs_mount *mp;
2129 struct xfs_buf *dead_buf;
2130 struct xfs_buf *last_buf;
2131 struct xfs_buf *sib_buf;
2132 struct xfs_buf *par_buf;
2133 xfs_dahash_t dead_hash;
2134 xfs_fileoff_t lastoff;
2135 xfs_dablk_t dead_blkno;
2136 xfs_dablk_t last_blkno;
2137 xfs_dablk_t sib_blkno;
2138 xfs_dablk_t par_blkno;
2139 int error;
2140 int w;
2141 int entno;
2142 int level;
2143 int dead_level;
2144
2145 trace_xfs_da_swap_lastblock(args);
2146
2147 dead_buf = *dead_bufp;
2148 dead_blkno = *dead_blknop;
2149 tp = args->trans;
2150 dp = args->dp;
2151 w = args->whichfork;
2152 ASSERT(w == XFS_DATA_FORK);
2153 mp = dp->i_mount;
2154 lastoff = args->geo->freeblk;
2155 error = xfs_bmap_last_before(tp, dp, &lastoff, w);
2156 if (error)
2157 return error;
2158 if (unlikely(lastoff == 0)) {
2159 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
2160 mp);
2161 return -EFSCORRUPTED;
2162 }
2163 /*
2164 * Read the last block in the btree space.
2165 */
2166 last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
2167 error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
2168 if (error)
2169 return error;
2170 /*
2171 * Copy the last block into the dead buffer and log it.
2172 */
2173 memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
2174 xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
2175 dead_info = dead_buf->b_addr;
2176 /*
2177 * Get values from the moved block.
2178 */
2179 if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
2180 dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
2181 struct xfs_dir3_icleaf_hdr leafhdr;
2182 struct xfs_dir2_leaf_entry *ents;
2183
2184 dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
2185 dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
2186 ents = dp->d_ops->leaf_ents_p(dead_leaf2);
2187 dead_level = 0;
2188 dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
2189 } else {
2190 struct xfs_da3_icnode_hdr deadhdr;
2191
2192 dead_node = (xfs_da_intnode_t *)dead_info;
2193 dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
2194 btree = dp->d_ops->node_tree_p(dead_node);
2195 dead_level = deadhdr.level;
2196 dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
2197 }
2198 sib_buf = par_buf = NULL;
2199 /*
2200 * If the moved block has a left sibling, fix up the pointers.
2201 */
2202 if ((sib_blkno = be32_to_cpu(dead_info->back))) {
2203 error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
2204 if (error)
2205 goto done;
2206 sib_info = sib_buf->b_addr;
2207 if (unlikely(
2208 be32_to_cpu(sib_info->forw) != last_blkno ||
2209 sib_info->magic != dead_info->magic)) {
2210 XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
2211 XFS_ERRLEVEL_LOW, mp);
2212 error = -EFSCORRUPTED;
2213 goto done;
2214 }
2215 sib_info->forw = cpu_to_be32(dead_blkno);
2216 xfs_trans_log_buf(tp, sib_buf,
2217 XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
2218 sizeof(sib_info->forw)));
2219 sib_buf = NULL;
2220 }
2221 /*
2222 * If the moved block has a right sibling, fix up the pointers.
2223 */
2224 if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
2225 error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
2226 if (error)
2227 goto done;
2228 sib_info = sib_buf->b_addr;
2229 if (unlikely(
2230 be32_to_cpu(sib_info->back) != last_blkno ||
2231 sib_info->magic != dead_info->magic)) {
2232 XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
2233 XFS_ERRLEVEL_LOW, mp);
2234 error = -EFSCORRUPTED;
2235 goto done;
2236 }
2237 sib_info->back = cpu_to_be32(dead_blkno);
2238 xfs_trans_log_buf(tp, sib_buf,
2239 XFS_DA_LOGRANGE(sib_info, &sib_info->back,
2240 sizeof(sib_info->back)));
2241 sib_buf = NULL;
2242 }
2243 par_blkno = args->geo->leafblk;
2244 level = -1;
2245 /*
2246 * Walk down the tree looking for the parent of the moved block.
2247 */
2248 for (;;) {
2249 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
2250 if (error)
2251 goto done;
2252 par_node = par_buf->b_addr;
2253 dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
2254 if (level >= 0 && level != par_hdr.level + 1) {
2255 XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
2256 XFS_ERRLEVEL_LOW, mp);
2257 error = -EFSCORRUPTED;
2258 goto done;
2259 }
2260 level = par_hdr.level;
2261 btree = dp->d_ops->node_tree_p(par_node);
2262 for (entno = 0;
2263 entno < par_hdr.count &&
2264 be32_to_cpu(btree[entno].hashval) < dead_hash;
2265 entno++)
2266 continue;
2267 if (entno == par_hdr.count) {
2268 XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
2269 XFS_ERRLEVEL_LOW, mp);
2270 error = -EFSCORRUPTED;
2271 goto done;
2272 }
2273 par_blkno = be32_to_cpu(btree[entno].before);
2274 if (level == dead_level + 1)
2275 break;
2276 xfs_trans_brelse(tp, par_buf);
2277 par_buf = NULL;
2278 }
2279 /*
2280 * We're in the right parent block.
2281 * Look for the right entry.
2282 */
2283 for (;;) {
2284 for (;
2285 entno < par_hdr.count &&
2286 be32_to_cpu(btree[entno].before) != last_blkno;
2287 entno++)
2288 continue;
2289 if (entno < par_hdr.count)
2290 break;
2291 par_blkno = par_hdr.forw;
2292 xfs_trans_brelse(tp, par_buf);
2293 par_buf = NULL;
2294 if (unlikely(par_blkno == 0)) {
2295 XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
2296 XFS_ERRLEVEL_LOW, mp);
2297 error = -EFSCORRUPTED;
2298 goto done;
2299 }
2300 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
2301 if (error)
2302 goto done;
2303 par_node = par_buf->b_addr;
2304 dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
2305 if (par_hdr.level != level) {
2306 XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
2307 XFS_ERRLEVEL_LOW, mp);
2308 error = -EFSCORRUPTED;
2309 goto done;
2310 }
2311 btree = dp->d_ops->node_tree_p(par_node);
2312 entno = 0;
2313 }
2314 /*
2315 * Update the parent entry pointing to the moved block.
2316 */
2317 btree[entno].before = cpu_to_be32(dead_blkno);
2318 xfs_trans_log_buf(tp, par_buf,
2319 XFS_DA_LOGRANGE(par_node, &btree[entno].before,
2320 sizeof(btree[entno].before)));
2321 *dead_blknop = last_blkno;
2322 *dead_bufp = last_buf;
2323 return 0;
2324done:
2325 if (par_buf)
2326 xfs_trans_brelse(tp, par_buf);
2327 if (sib_buf)
2328 xfs_trans_brelse(tp, sib_buf);
2329 xfs_trans_brelse(tp, last_buf);
2330 return error;
2331}
2332
2333/*
2334 * Remove a btree block from a directory or attribute.
2335 */
2336int
2337xfs_da_shrink_inode(
2338 xfs_da_args_t *args,
2339 xfs_dablk_t dead_blkno,
2340 struct xfs_buf *dead_buf)
2341{
2342 xfs_inode_t *dp;
2343 int done, error, w, count;
2344 xfs_trans_t *tp;
2345 xfs_mount_t *mp;
2346
2347 trace_xfs_da_shrink_inode(args);
2348
2349 dp = args->dp;
2350 w = args->whichfork;
2351 tp = args->trans;
2352 mp = dp->i_mount;
2353 count = args->geo->fsbcount;
2354 for (;;) {
2355 /*
2356 * Remove extents. If we get ENOSPC for a dir we have to move
2357 * the last block to the place we want to kill.
2358 */
2359 error = xfs_bunmapi(tp, dp, dead_blkno, count,
2360 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
2361 0, args->firstblock, args->flist, &done);
2362 if (error == -ENOSPC) {
2363 if (w != XFS_DATA_FORK)
2364 break;
2365 error = xfs_da3_swap_lastblock(args, &dead_blkno,
2366 &dead_buf);
2367 if (error)
2368 break;
2369 } else {
2370 break;
2371 }
2372 }
2373 xfs_trans_binval(tp, dead_buf);
2374 return error;
2375}
2376
2377/*
2378 * See if the mapping(s) for this btree block are valid, i.e.
2379 * don't contain holes, are logically contiguous, and cover the whole range.
2380 */
2381STATIC int
2382xfs_da_map_covers_blocks(
2383 int nmap,
2384 xfs_bmbt_irec_t *mapp,
2385 xfs_dablk_t bno,
2386 int count)
2387{
2388 int i;
2389 xfs_fileoff_t off;
2390
2391 for (i = 0, off = bno; i < nmap; i++) {
2392 if (mapp[i].br_startblock == HOLESTARTBLOCK ||
2393 mapp[i].br_startblock == DELAYSTARTBLOCK) {
2394 return 0;
2395 }
2396 if (off != mapp[i].br_startoff) {
2397 return 0;
2398 }
2399 off += mapp[i].br_blockcount;
2400 }
2401 return off == bno + count;
2402}
2403
2404/*
2405 * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
2406 *
2407 * For the single map case, it is assumed that the caller has provided a pointer
2408 * to a valid xfs_buf_map. For the multiple map case, this function will
2409 * allocate the xfs_buf_map to hold all the maps and replace the caller's single
2410 * map pointer with the allocated map.
2411 */
2412static int
2413xfs_buf_map_from_irec(
2414 struct xfs_mount *mp,
2415 struct xfs_buf_map **mapp,
2416 int *nmaps,
2417 struct xfs_bmbt_irec *irecs,
2418 int nirecs)
2419{
2420 struct xfs_buf_map *map;
2421 int i;
2422
2423 ASSERT(*nmaps == 1);
2424 ASSERT(nirecs >= 1);
2425
2426 if (nirecs > 1) {
2427 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2428 KM_SLEEP | KM_NOFS);
2429 if (!map)
2430 return -ENOMEM;
2431 *mapp = map;
2432 }
2433
2434 *nmaps = nirecs;
2435 map = *mapp;
2436 for (i = 0; i < *nmaps; i++) {
2437 ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
2438 irecs[i].br_startblock != HOLESTARTBLOCK);
2439 map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
2440 map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
2441 }
2442 return 0;
2443}
2444
2445/*
2446 * Map the block we are given ready for reading. There are three possible return
2447 * values:
2448 * -1 - will be returned if we land in a hole and mappedbno == -2 so the
2449 * caller knows not to execute a subsequent read.
2450 * 0 - if we mapped the block successfully
2451 * >0 - positive error number if there was an error.
2452 */
2453static int
2454xfs_dabuf_map(
2455 struct xfs_inode *dp,
2456 xfs_dablk_t bno,
2457 xfs_daddr_t mappedbno,
2458 int whichfork,
2459 struct xfs_buf_map **map,
2460 int *nmaps)
2461{
2462 struct xfs_mount *mp = dp->i_mount;
2463 int nfsb;
2464 int error = 0;
2465 struct xfs_bmbt_irec irec;
2466 struct xfs_bmbt_irec *irecs = &irec;
2467 int nirecs;
2468
2469 ASSERT(map && *map);
2470 ASSERT(*nmaps == 1);
2471
2472 if (whichfork == XFS_DATA_FORK)
2473 nfsb = mp->m_dir_geo->fsbcount;
2474 else
2475 nfsb = mp->m_attr_geo->fsbcount;
2476
2477 /*
2478 * Caller doesn't have a mapping. -2 means don't complain
2479 * if we land in a hole.
2480 */
2481 if (mappedbno == -1 || mappedbno == -2) {
2482 /*
2483 * Optimize the one-block case.
2484 */
2485 if (nfsb != 1)
2486 irecs = kmem_zalloc(sizeof(irec) * nfsb,
2487 KM_SLEEP | KM_NOFS);
2488
2489 nirecs = nfsb;
2490 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
2491 &nirecs, xfs_bmapi_aflag(whichfork));
2492 if (error)
2493 goto out;
2494 } else {
2495 irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2496 irecs->br_startoff = (xfs_fileoff_t)bno;
2497 irecs->br_blockcount = nfsb;
2498 irecs->br_state = 0;
2499 nirecs = 1;
2500 }
2501
2502 if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
2503 error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
2504 if (unlikely(error == -EFSCORRUPTED)) {
2505 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2506 int i;
2507 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
2508 __func__, (long long)bno,
2509 (long long)dp->i_ino);
2510 for (i = 0; i < *nmaps; i++) {
2511 xfs_alert(mp,
2512"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2513 i,
2514 (long long)irecs[i].br_startoff,
2515 (long long)irecs[i].br_startblock,
2516 (long long)irecs[i].br_blockcount,
2517 irecs[i].br_state);
2518 }
2519 }
2520 XFS_ERROR_REPORT("xfs_da_do_buf(1)",
2521 XFS_ERRLEVEL_LOW, mp);
2522 }
2523 goto out;
2524 }
2525 error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
2526out:
2527 if (irecs != &irec)
2528 kmem_free(irecs);
2529 return error;
2530}
2531
2532/*
2533 * Get a buffer for the dir/attr block.
2534 */
2535int
2536xfs_da_get_buf(
2537 struct xfs_trans *trans,
2538 struct xfs_inode *dp,
2539 xfs_dablk_t bno,
2540 xfs_daddr_t mappedbno,
2541 struct xfs_buf **bpp,
2542 int whichfork)
2543{
2544 struct xfs_buf *bp;
2545 struct xfs_buf_map map;
2546 struct xfs_buf_map *mapp;
2547 int nmap;
2548 int error;
2549
2550 *bpp = NULL;
2551 mapp = &map;
2552 nmap = 1;
2553 error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2554 &mapp, &nmap);
2555 if (error) {
2556 /* mapping a hole is not an error, but we don't continue */
2557 if (error == -1)
2558 error = 0;
2559 goto out_free;
2560 }
2561
2562 bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
2563 mapp, nmap, 0);
2564 error = bp ? bp->b_error : -EIO;
2565 if (error) {
2566 xfs_trans_brelse(trans, bp);
2567 goto out_free;
2568 }
2569
2570 *bpp = bp;
2571
2572out_free:
2573 if (mapp != &map)
2574 kmem_free(mapp);
2575
2576 return error;
2577}
2578
2579/*
2580 * Get a buffer for the dir/attr block, fill in the contents.
2581 */
2582int
2583xfs_da_read_buf(
2584 struct xfs_trans *trans,
2585 struct xfs_inode *dp,
2586 xfs_dablk_t bno,
2587 xfs_daddr_t mappedbno,
2588 struct xfs_buf **bpp,
2589 int whichfork,
2590 const struct xfs_buf_ops *ops)
2591{
2592 struct xfs_buf *bp;
2593 struct xfs_buf_map map;
2594 struct xfs_buf_map *mapp;
2595 int nmap;
2596 int error;
2597
2598 *bpp = NULL;
2599 mapp = &map;
2600 nmap = 1;
2601 error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2602 &mapp, &nmap);
2603 if (error) {
2604 /* mapping a hole is not an error, but we don't continue */
2605 if (error == -1)
2606 error = 0;
2607 goto out_free;
2608 }
2609
2610 error = xfs_trans_read_buf_map(dp->i_mount, trans,
2611 dp->i_mount->m_ddev_targp,
2612 mapp, nmap, 0, &bp, ops);
2613 if (error)
2614 goto out_free;
2615
2616 if (whichfork == XFS_ATTR_FORK)
2617 xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
2618 else
2619 xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
2620 *bpp = bp;
2621out_free:
2622 if (mapp != &map)
2623 kmem_free(mapp);
2624
2625 return error;
2626}
2627
2628/*
2629 * Readahead the dir/attr block.
2630 */
2631xfs_daddr_t
2632xfs_da_reada_buf(
2633 struct xfs_inode *dp,
2634 xfs_dablk_t bno,
2635 xfs_daddr_t mappedbno,
2636 int whichfork,
2637 const struct xfs_buf_ops *ops)
2638{
2639 struct xfs_buf_map map;
2640 struct xfs_buf_map *mapp;
2641 int nmap;
2642 int error;
2643
2644 mapp = &map;
2645 nmap = 1;
2646 error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2647 &mapp, &nmap);
2648 if (error) {
2649 /* mapping a hole is not an error, but we don't continue */
2650 if (error == -1)
2651 error = 0;
2652 goto out_free;
2653 }
2654
2655 mappedbno = mapp[0].bm_bn;
2656 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2657
2658out_free:
2659 if (mapp != &map)
2660 kmem_free(mapp);
2661
2662 if (error)
2663 return -1;
2664 return mappedbno;
2665}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
new file mode 100644
index 000000000000..6e153e399a77
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -0,0 +1,221 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#ifndef __XFS_DA_BTREE_H__
20#define __XFS_DA_BTREE_H__
21
22struct xfs_bmap_free;
23struct xfs_inode;
24struct xfs_trans;
25struct zone;
26struct xfs_dir_ops;
27
28/*
29 * Directory/attribute geometry information. There will be one of these for each
30 * data fork type, and it will be passed around via the xfs_da_args. Global
31 * structures will be attached to the xfs_mount.
32 */
33struct xfs_da_geometry {
34 int blksize; /* da block size in bytes */
35 int fsbcount; /* da block size in filesystem blocks */
36 uint8_t fsblog; /* log2 of _filesystem_ block size */
37 uint8_t blklog; /* log2 of da block size */
38 uint node_ents; /* # of entries in a danode */
39 int magicpct; /* 37% of block size in bytes */
40 xfs_dablk_t datablk; /* blockno of dir data v2 */
41 xfs_dablk_t leafblk; /* blockno of leaf data v2 */
42 xfs_dablk_t freeblk; /* blockno of free data v2 */
43};
44
45/*========================================================================
46 * Btree searching and modification structure definitions.
47 *========================================================================*/
48
49/*
50 * Search comparison results
51 */
enum xfs_dacmp {
	XFS_CMP_DIFFERENT	= 0,	/* the names do not match at all */
	XFS_CMP_EXACT		= 1,	/* the names match exactly */
	XFS_CMP_CASE		= 2	/* the names match apart from case */
};
57
58/*
59 * Structure to ease passing around component names.
60 */
61typedef struct xfs_da_args {
62 struct xfs_da_geometry *geo; /* da block geometry */
63 const __uint8_t *name; /* string (maybe not NULL terminated) */
64 int namelen; /* length of string (maybe no NULL) */
65 __uint8_t filetype; /* filetype of inode for directories */
66 __uint8_t *value; /* set of bytes (maybe contain NULLs) */
67 int valuelen; /* length of value */
68 int flags; /* argument flags (eg: ATTR_NOCREATE) */
69 xfs_dahash_t hashval; /* hash value of name */
70 xfs_ino_t inumber; /* input/output inode number */
71 struct xfs_inode *dp; /* directory inode to manipulate */
72 xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */
73 struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */
74 struct xfs_trans *trans; /* current trans (changes over time) */
75 xfs_extlen_t total; /* total blocks needed, for 1st bmap */
76 int whichfork; /* data or attribute fork */
77 xfs_dablk_t blkno; /* blkno of attr leaf of interest */
78 int index; /* index of attr of interest in blk */
79 xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
80 int rmtblkcnt; /* remote attr value block count */
81 int rmtvaluelen; /* remote attr value length in bytes */
82 xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
83 int index2; /* index of 2nd attr in blk */
84 xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
85 int rmtblkcnt2; /* remote attr value block count */
86 int rmtvaluelen2; /* remote attr value length in bytes */
87 int op_flags; /* operation flags */
88 enum xfs_dacmp cmpresult; /* name compare result for lookups */
89} xfs_da_args_t;
90
91/*
92 * Operation flags:
93 */
#define XFS_DA_OP_JUSTCHECK	(1 << 0) /* check for ok with no space */
#define XFS_DA_OP_RENAME	(1 << 1) /* this is an atomic rename op */
#define XFS_DA_OP_ADDNAME	(1 << 2) /* this is an add operation */
#define XFS_DA_OP_OKNOENT	(1 << 3) /* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP	(1 << 4) /* lookup to return CI name if found */

/* Flag value / printable name pairs, one per operation flag. */
#define XFS_DA_OP_FLAGS \
	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
	{ XFS_DA_OP_RENAME,	"RENAME" }, \
	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
106
107/*
108 * Storage for holding state during Btree searches and split/join ops.
109 *
110 * Only need space for 5 intermediate nodes. With a minimum of 62-way
111 * fanout to the Btree, we can support over 900 million directory blocks,
112 * which is slightly more than enough.
113 */
114typedef struct xfs_da_state_blk {
115 struct xfs_buf *bp; /* buffer containing block */
116 xfs_dablk_t blkno; /* filesystem blkno of buffer */
117 xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
118 int index; /* relevant index into block */
119 xfs_dahash_t hashval; /* last hash value in block */
120 int magic; /* blk's magic number, ie: blk type */
121} xfs_da_state_blk_t;
122
123typedef struct xfs_da_state_path {
124 int active; /* number of active levels */
125 xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH];
126} xfs_da_state_path_t;
127
128typedef struct xfs_da_state {
129 xfs_da_args_t *args; /* filename arguments */
130 struct xfs_mount *mp; /* filesystem mount point */
131 xfs_da_state_path_t path; /* search/split paths */
132 xfs_da_state_path_t altpath; /* alternate path for join */
133 unsigned char inleaf; /* insert into 1->lf, 0->splf */
134 unsigned char extravalid; /* T/F: extrablk is in use */
135 unsigned char extraafter; /* T/F: extrablk is after new */
136 xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
137 /* for dirv2 extrablk is data */
138} xfs_da_state_t;
139
140/*
141 * Utility macros to aid in logging changed structure fields.
142 */
/* Byte distance from BASE to ADDR. */
#define XFS_DA_LOGOFF(BASE, ADDR) \
	((char *)(ADDR) - (char *)(BASE))

/*
 * Expands to TWO comma-separated values: the offset of the first byte and the
 * offset of the last byte of a SIZE byte region that starts at ADDR.
 */
#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
	(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
	(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
147
148/*
149 * Name ops for directory and/or attr name operations
150 */
151struct xfs_nameops {
152 xfs_dahash_t (*hashname)(struct xfs_name *);
153 enum xfs_dacmp (*compname)(struct xfs_da_args *,
154 const unsigned char *, int);
155};
156
157
158/*========================================================================
159 * Function prototypes.
160 *========================================================================*/
161
162/*
163 * Routines used for growing the Btree.
164 */
165int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
166 int level, struct xfs_buf **bpp, int whichfork);
167int xfs_da3_split(xfs_da_state_t *state);
168
169/*
170 * Routines used for shrinking the Btree.
171 */
172int xfs_da3_join(xfs_da_state_t *state);
173void xfs_da3_fixhashpath(struct xfs_da_state *state,
174 struct xfs_da_state_path *path_to_to_fix);
175
176/*
177 * Routines used for finding things in the Btree.
178 */
179int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
180int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
181 int forward, int release, int *result);
182/*
183 * Utility routines.
184 */
185int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
186 xfs_da_state_blk_t *new_blk);
187int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
188 xfs_dablk_t bno, xfs_daddr_t mappedbno,
189 struct xfs_buf **bpp, int which_fork);
190
191/*
192 * Utility routines.
193 */
194int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
195int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
196 int count);
197int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
198 xfs_dablk_t bno, xfs_daddr_t mappedbno,
199 struct xfs_buf **bp, int whichfork);
200int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
201 xfs_dablk_t bno, xfs_daddr_t mappedbno,
202 struct xfs_buf **bpp, int whichfork,
203 const struct xfs_buf_ops *ops);
204xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
205 xfs_daddr_t mapped_bno, int whichfork,
206 const struct xfs_buf_ops *ops);
207int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
208 struct xfs_buf *dead_buf);
209
210uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
211enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
212 const unsigned char *name, int len);
213
214
215xfs_da_state_t *xfs_da_state_alloc(void);
216void xfs_da_state_free(xfs_da_state_t *state);
217
218extern struct kmem_zone *xfs_da_state_zone;
219extern const struct xfs_nameops xfs_default_nameops;
220
221#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
new file mode 100644
index 000000000000..c9aee52a37e2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -0,0 +1,911 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_shared.h"
22#include "xfs_format.h"
23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_format.h"
29#include "xfs_da_btree.h"
30#include "xfs_inode.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h"
33
34/*
35 * Shortform directory ops
36 */
37static int
38xfs_dir2_sf_entsize(
39 struct xfs_dir2_sf_hdr *hdr,
40 int len)
41{
42 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
43
44 count += len; /* name */
45 count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
46 sizeof(xfs_dir2_ino4_t); /* ino # */
47 return count;
48}
49
50static int
51xfs_dir3_sf_entsize(
52 struct xfs_dir2_sf_hdr *hdr,
53 int len)
54{
55 return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
56}
57
58static struct xfs_dir2_sf_entry *
59xfs_dir2_sf_nextentry(
60 struct xfs_dir2_sf_hdr *hdr,
61 struct xfs_dir2_sf_entry *sfep)
62{
63 return (struct xfs_dir2_sf_entry *)
64 ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
65}
66
67static struct xfs_dir2_sf_entry *
68xfs_dir3_sf_nextentry(
69 struct xfs_dir2_sf_hdr *hdr,
70 struct xfs_dir2_sf_entry *sfep)
71{
72 return (struct xfs_dir2_sf_entry *)
73 ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
74}
75
76
/*
 * For filetype enabled shortform directories, the file type field is stored at
 * the end of the name. Because it's only a single byte, endian conversion is
 * not necessary. For non-filetype enabled directories, the type is always
 * unknown and we never store the value.
 */
83static __uint8_t
84xfs_dir2_sfe_get_ftype(
85 struct xfs_dir2_sf_entry *sfep)
86{
87 return XFS_DIR3_FT_UNKNOWN;
88}
89
90static void
91xfs_dir2_sfe_put_ftype(
92 struct xfs_dir2_sf_entry *sfep,
93 __uint8_t ftype)
94{
95 ASSERT(ftype < XFS_DIR3_FT_MAX);
96}
97
98static __uint8_t
99xfs_dir3_sfe_get_ftype(
100 struct xfs_dir2_sf_entry *sfep)
101{
102 __uint8_t ftype;
103
104 ftype = sfep->name[sfep->namelen];
105 if (ftype >= XFS_DIR3_FT_MAX)
106 return XFS_DIR3_FT_UNKNOWN;
107 return ftype;
108}
109
110static void
111xfs_dir3_sfe_put_ftype(
112 struct xfs_dir2_sf_entry *sfep,
113 __uint8_t ftype)
114{
115 ASSERT(ftype < XFS_DIR3_FT_MAX);
116
117 sfep->name[sfep->namelen] = ftype;
118}
119
120/*
121 * Inode numbers in short-form directories can come in two versions,
122 * either 4 bytes or 8 bytes wide. These helpers deal with the
123 * two forms transparently by looking at the headers i8count field.
124 *
125 * For 64-bit inode number the most significant byte must be zero.
126 */
127static xfs_ino_t
128xfs_dir2_sf_get_ino(
129 struct xfs_dir2_sf_hdr *hdr,
130 xfs_dir2_inou_t *from)
131{
132 if (hdr->i8count)
133 return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
134 else
135 return get_unaligned_be32(&from->i4.i);
136}
137
138static void
139xfs_dir2_sf_put_ino(
140 struct xfs_dir2_sf_hdr *hdr,
141 xfs_dir2_inou_t *to,
142 xfs_ino_t ino)
143{
144 ASSERT((ino & 0xff00000000000000ULL) == 0);
145
146 if (hdr->i8count)
147 put_unaligned_be64(ino, &to->i8.i);
148 else
149 put_unaligned_be32(ino, &to->i4.i);
150}
151
152static xfs_ino_t
153xfs_dir2_sf_get_parent_ino(
154 struct xfs_dir2_sf_hdr *hdr)
155{
156 return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
157}
158
159static void
160xfs_dir2_sf_put_parent_ino(
161 struct xfs_dir2_sf_hdr *hdr,
162 xfs_ino_t ino)
163{
164 xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
165}
166
167/*
168 * In short-form directory entries the inode numbers are stored at variable
169 * offset behind the entry name. If the entry stores a filetype value, then it
170 * sits between the name and the inode number. Hence the inode numbers may only
171 * be accessed through the helpers below.
172 */
173static xfs_ino_t
174xfs_dir2_sfe_get_ino(
175 struct xfs_dir2_sf_hdr *hdr,
176 struct xfs_dir2_sf_entry *sfep)
177{
178 return xfs_dir2_sf_get_ino(hdr,
179 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
180}
181
182static void
183xfs_dir2_sfe_put_ino(
184 struct xfs_dir2_sf_hdr *hdr,
185 struct xfs_dir2_sf_entry *sfep,
186 xfs_ino_t ino)
187{
188 xfs_dir2_sf_put_ino(hdr,
189 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
190}
191
192static xfs_ino_t
193xfs_dir3_sfe_get_ino(
194 struct xfs_dir2_sf_hdr *hdr,
195 struct xfs_dir2_sf_entry *sfep)
196{
197 return xfs_dir2_sf_get_ino(hdr,
198 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
199}
200
201static void
202xfs_dir3_sfe_put_ino(
203 struct xfs_dir2_sf_hdr *hdr,
204 struct xfs_dir2_sf_entry *sfep,
205 xfs_ino_t ino)
206{
207 xfs_dir2_sf_put_ino(hdr,
208 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
209}
210
211
212/*
213 * Directory data block operations
214 */
215
/*
 * For special situations, the dirent size ends up fixed because we always know
 * what the size of the entry is. That's true for the "." and "..", and
 * therefore we know that they are a fixed size and hence their offsets are
 * constant, as is the first entry.
 *
 * Hence, this calculation is written as a macro to be able to be calculated at
 * compile time and so certain offsets can be calculated directly in the
 * structure initialiser via the macro. There are two macros - one for dirents
 * with ftype and one for dirents without - so there are no unresolvable
 * conditionals in the calculations. We also use round_up() as
 * XFS_DIR2_DATA_ALIGN is always a power of 2 and the compiler doesn't reject
 * it (unlike roundup()).
 */
/* Entry header up to the name + n name bytes + tag, rounded up. */
#define XFS_DIR2_DATA_ENTSIZE(n) \
	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + \
		  (n) + \
		  sizeof(xfs_dir2_data_off_t)), \
		 XFS_DIR2_DATA_ALIGN)

/* As above, with one more __uint8_t counted before rounding up. */
#define XFS_DIR3_DATA_ENTSIZE(n) \
	round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + \
		  (n) + \
		  sizeof(xfs_dir2_data_off_t) + \
		  sizeof(__uint8_t)), \
		 XFS_DIR2_DATA_ALIGN)
237
/*
 * Function form of XFS_DIR2_DATA_ENTSIZE() for a name of @n bytes.
 */
static int
xfs_dir2_data_entsize(
	int	n)
{
	int	entsize = XFS_DIR2_DATA_ENTSIZE(n);

	return entsize;
}
244
/*
 * Function form of XFS_DIR3_DATA_ENTSIZE() for a name of @n bytes.
 */
static int
xfs_dir3_data_entsize(
	int	n)
{
	int	entsize = XFS_DIR3_DATA_ENTSIZE(n);

	return entsize;
}
251
252static __uint8_t
253xfs_dir2_data_get_ftype(
254 struct xfs_dir2_data_entry *dep)
255{
256 return XFS_DIR3_FT_UNKNOWN;
257}
258
259static void
260xfs_dir2_data_put_ftype(
261 struct xfs_dir2_data_entry *dep,
262 __uint8_t ftype)
263{
264 ASSERT(ftype < XFS_DIR3_FT_MAX);
265}
266
267static __uint8_t
268xfs_dir3_data_get_ftype(
269 struct xfs_dir2_data_entry *dep)
270{
271 __uint8_t ftype = dep->name[dep->namelen];
272
273 ASSERT(ftype < XFS_DIR3_FT_MAX);
274 if (ftype >= XFS_DIR3_FT_MAX)
275 return XFS_DIR3_FT_UNKNOWN;
276 return ftype;
277}
278
279static void
280xfs_dir3_data_put_ftype(
281 struct xfs_dir2_data_entry *dep,
282 __uint8_t type)
283{
284 ASSERT(type < XFS_DIR3_FT_MAX);
285 ASSERT(dep->namelen != 0);
286
287 dep->name[dep->namelen] = type;
288}
289
290/*
291 * Pointer to an entry's tag word.
292 */
293static __be16 *
294xfs_dir2_data_entry_tag_p(
295 struct xfs_dir2_data_entry *dep)
296{
297 return (__be16 *)((char *)dep +
298 xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
299}
300
301static __be16 *
302xfs_dir3_data_entry_tag_p(
303 struct xfs_dir2_data_entry *dep)
304{
305 return (__be16 *)((char *)dep +
306 xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
307}
308
309/*
310 * location of . and .. in data space (always block 0)
311 */
312static struct xfs_dir2_data_entry *
313xfs_dir2_data_dot_entry_p(
314 struct xfs_dir2_data_hdr *hdr)
315{
316 return (struct xfs_dir2_data_entry *)
317 ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
318}
319
320static struct xfs_dir2_data_entry *
321xfs_dir2_data_dotdot_entry_p(
322 struct xfs_dir2_data_hdr *hdr)
323{
324 return (struct xfs_dir2_data_entry *)
325 ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
326 XFS_DIR2_DATA_ENTSIZE(1));
327}
328
329static struct xfs_dir2_data_entry *
330xfs_dir2_data_first_entry_p(
331 struct xfs_dir2_data_hdr *hdr)
332{
333 return (struct xfs_dir2_data_entry *)
334 ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
335 XFS_DIR2_DATA_ENTSIZE(1) +
336 XFS_DIR2_DATA_ENTSIZE(2));
337}
338
339static struct xfs_dir2_data_entry *
340xfs_dir2_ftype_data_dotdot_entry_p(
341 struct xfs_dir2_data_hdr *hdr)
342{
343 return (struct xfs_dir2_data_entry *)
344 ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
345 XFS_DIR3_DATA_ENTSIZE(1));
346}
347
348static struct xfs_dir2_data_entry *
349xfs_dir2_ftype_data_first_entry_p(
350 struct xfs_dir2_data_hdr *hdr)
351{
352 return (struct xfs_dir2_data_entry *)
353 ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
354 XFS_DIR3_DATA_ENTSIZE(1) +
355 XFS_DIR3_DATA_ENTSIZE(2));
356}
357
358static struct xfs_dir2_data_entry *
359xfs_dir3_data_dot_entry_p(
360 struct xfs_dir2_data_hdr *hdr)
361{
362 return (struct xfs_dir2_data_entry *)
363 ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
364}
365
366static struct xfs_dir2_data_entry *
367xfs_dir3_data_dotdot_entry_p(
368 struct xfs_dir2_data_hdr *hdr)
369{
370 return (struct xfs_dir2_data_entry *)
371 ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
372 XFS_DIR3_DATA_ENTSIZE(1));
373}
374
375static struct xfs_dir2_data_entry *
376xfs_dir3_data_first_entry_p(
377 struct xfs_dir2_data_hdr *hdr)
378{
379 return (struct xfs_dir2_data_entry *)
380 ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
381 XFS_DIR3_DATA_ENTSIZE(1) +
382 XFS_DIR3_DATA_ENTSIZE(2));
383}
384
385static struct xfs_dir2_data_free *
386xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
387{
388 return hdr->bestfree;
389}
390
391static struct xfs_dir2_data_free *
392xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
393{
394 return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
395}
396
397static struct xfs_dir2_data_entry *
398xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
399{
400 return (struct xfs_dir2_data_entry *)
401 ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
402}
403
404static struct xfs_dir2_data_unused *
405xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
406{
407 return (struct xfs_dir2_data_unused *)
408 ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
409}
410
411static struct xfs_dir2_data_entry *
412xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
413{
414 return (struct xfs_dir2_data_entry *)
415 ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
416}
417
418static struct xfs_dir2_data_unused *
419xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
420{
421 return (struct xfs_dir2_data_unused *)
422 ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
423}
424
425
426/*
427 * Directory Leaf block operations
428 */
429static int
430xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
431{
432 return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
433 (uint)sizeof(struct xfs_dir2_leaf_entry);
434}
435
436static struct xfs_dir2_leaf_entry *
437xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
438{
439 return lp->__ents;
440}
441
442static int
443xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
444{
445 return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
446 (uint)sizeof(struct xfs_dir2_leaf_entry);
447}
448
449static struct xfs_dir2_leaf_entry *
450xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
451{
452 return ((struct xfs_dir3_leaf *)lp)->__ents;
453}
454
455static void
456xfs_dir2_leaf_hdr_from_disk(
457 struct xfs_dir3_icleaf_hdr *to,
458 struct xfs_dir2_leaf *from)
459{
460 to->forw = be32_to_cpu(from->hdr.info.forw);
461 to->back = be32_to_cpu(from->hdr.info.back);
462 to->magic = be16_to_cpu(from->hdr.info.magic);
463 to->count = be16_to_cpu(from->hdr.count);
464 to->stale = be16_to_cpu(from->hdr.stale);
465
466 ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
467 to->magic == XFS_DIR2_LEAFN_MAGIC);
468}
469
470static void
471xfs_dir2_leaf_hdr_to_disk(
472 struct xfs_dir2_leaf *to,
473 struct xfs_dir3_icleaf_hdr *from)
474{
475 ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
476 from->magic == XFS_DIR2_LEAFN_MAGIC);
477
478 to->hdr.info.forw = cpu_to_be32(from->forw);
479 to->hdr.info.back = cpu_to_be32(from->back);
480 to->hdr.info.magic = cpu_to_be16(from->magic);
481 to->hdr.count = cpu_to_be16(from->count);
482 to->hdr.stale = cpu_to_be16(from->stale);
483}
484
485static void
486xfs_dir3_leaf_hdr_from_disk(
487 struct xfs_dir3_icleaf_hdr *to,
488 struct xfs_dir2_leaf *from)
489{
490 struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
491
492 to->forw = be32_to_cpu(hdr3->info.hdr.forw);
493 to->back = be32_to_cpu(hdr3->info.hdr.back);
494 to->magic = be16_to_cpu(hdr3->info.hdr.magic);
495 to->count = be16_to_cpu(hdr3->count);
496 to->stale = be16_to_cpu(hdr3->stale);
497
498 ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
499 to->magic == XFS_DIR3_LEAFN_MAGIC);
500}
501
502static void
503xfs_dir3_leaf_hdr_to_disk(
504 struct xfs_dir2_leaf *to,
505 struct xfs_dir3_icleaf_hdr *from)
506{
507 struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
508
509 ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
510 from->magic == XFS_DIR3_LEAFN_MAGIC);
511
512 hdr3->info.hdr.forw = cpu_to_be32(from->forw);
513 hdr3->info.hdr.back = cpu_to_be32(from->back);
514 hdr3->info.hdr.magic = cpu_to_be16(from->magic);
515 hdr3->count = cpu_to_be16(from->count);
516 hdr3->stale = cpu_to_be16(from->stale);
517}
518
519
520/*
521 * Directory/Attribute Node block operations
522 */
523static struct xfs_da_node_entry *
524xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
525{
526 return dap->__btree;
527}
528
529static struct xfs_da_node_entry *
530xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
531{
532 return ((struct xfs_da3_intnode *)dap)->__btree;
533}
534
535static void
536xfs_da2_node_hdr_from_disk(
537 struct xfs_da3_icnode_hdr *to,
538 struct xfs_da_intnode *from)
539{
540 ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
541 to->forw = be32_to_cpu(from->hdr.info.forw);
542 to->back = be32_to_cpu(from->hdr.info.back);
543 to->magic = be16_to_cpu(from->hdr.info.magic);
544 to->count = be16_to_cpu(from->hdr.__count);
545 to->level = be16_to_cpu(from->hdr.__level);
546}
547
548static void
549xfs_da2_node_hdr_to_disk(
550 struct xfs_da_intnode *to,
551 struct xfs_da3_icnode_hdr *from)
552{
553 ASSERT(from->magic == XFS_DA_NODE_MAGIC);
554 to->hdr.info.forw = cpu_to_be32(from->forw);
555 to->hdr.info.back = cpu_to_be32(from->back);
556 to->hdr.info.magic = cpu_to_be16(from->magic);
557 to->hdr.__count = cpu_to_be16(from->count);
558 to->hdr.__level = cpu_to_be16(from->level);
559}
560
561static void
562xfs_da3_node_hdr_from_disk(
563 struct xfs_da3_icnode_hdr *to,
564 struct xfs_da_intnode *from)
565{
566 struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
567
568 ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
569 to->forw = be32_to_cpu(hdr3->info.hdr.forw);
570 to->back = be32_to_cpu(hdr3->info.hdr.back);
571 to->magic = be16_to_cpu(hdr3->info.hdr.magic);
572 to->count = be16_to_cpu(hdr3->__count);
573 to->level = be16_to_cpu(hdr3->__level);
574}
575
576static void
577xfs_da3_node_hdr_to_disk(
578 struct xfs_da_intnode *to,
579 struct xfs_da3_icnode_hdr *from)
580{
581 struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
582
583 ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
584 hdr3->info.hdr.forw = cpu_to_be32(from->forw);
585 hdr3->info.hdr.back = cpu_to_be32(from->back);
586 hdr3->info.hdr.magic = cpu_to_be16(from->magic);
587 hdr3->__count = cpu_to_be16(from->count);
588 hdr3->__level = cpu_to_be16(from->level);
589}
590
591
592/*
593 * Directory free space block operations
594 */
595static int
596xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
597{
598 return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
599 sizeof(xfs_dir2_data_off_t);
600}
601
602static __be16 *
603xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
604{
605 return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
606}
607
608/*
609 * Convert data space db to the corresponding free db.
610 */
611static xfs_dir2_db_t
612xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
613{
614 return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
615 (db / xfs_dir2_free_max_bests(geo));
616}
617
618/*
619 * Convert data space db to the corresponding index in a free db.
620 */
621static int
622xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
623{
624 return db % xfs_dir2_free_max_bests(geo);
625}
626
627static int
628xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
629{
630 return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
631 sizeof(xfs_dir2_data_off_t);
632}
633
634static __be16 *
635xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
636{
637 return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
638}
639
640/*
641 * Convert data space db to the corresponding free db.
642 */
643static xfs_dir2_db_t
644xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
645{
646 return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
647 (db / xfs_dir3_free_max_bests(geo));
648}
649
650/*
651 * Convert data space db to the corresponding index in a free db.
652 */
653static int
654xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
655{
656 return db % xfs_dir3_free_max_bests(geo);
657}
658
659static void
660xfs_dir2_free_hdr_from_disk(
661 struct xfs_dir3_icfree_hdr *to,
662 struct xfs_dir2_free *from)
663{
664 to->magic = be32_to_cpu(from->hdr.magic);
665 to->firstdb = be32_to_cpu(from->hdr.firstdb);
666 to->nvalid = be32_to_cpu(from->hdr.nvalid);
667 to->nused = be32_to_cpu(from->hdr.nused);
668 ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
669}
670
671static void
672xfs_dir2_free_hdr_to_disk(
673 struct xfs_dir2_free *to,
674 struct xfs_dir3_icfree_hdr *from)
675{
676 ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
677
678 to->hdr.magic = cpu_to_be32(from->magic);
679 to->hdr.firstdb = cpu_to_be32(from->firstdb);
680 to->hdr.nvalid = cpu_to_be32(from->nvalid);
681 to->hdr.nused = cpu_to_be32(from->nused);
682}
683
684static void
685xfs_dir3_free_hdr_from_disk(
686 struct xfs_dir3_icfree_hdr *to,
687 struct xfs_dir2_free *from)
688{
689 struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
690
691 to->magic = be32_to_cpu(hdr3->hdr.magic);
692 to->firstdb = be32_to_cpu(hdr3->firstdb);
693 to->nvalid = be32_to_cpu(hdr3->nvalid);
694 to->nused = be32_to_cpu(hdr3->nused);
695
696 ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
697}
698
699static void
700xfs_dir3_free_hdr_to_disk(
701 struct xfs_dir2_free *to,
702 struct xfs_dir3_icfree_hdr *from)
703{
704 struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
705
706 ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
707
708 hdr3->hdr.magic = cpu_to_be32(from->magic);
709 hdr3->firstdb = cpu_to_be32(from->firstdb);
710 hdr3->nvalid = cpu_to_be32(from->nvalid);
711 hdr3->nused = cpu_to_be32(from->nused);
712}
713
714static const struct xfs_dir_ops xfs_dir2_ops = {
715 .sf_entsize = xfs_dir2_sf_entsize,
716 .sf_nextentry = xfs_dir2_sf_nextentry,
717 .sf_get_ftype = xfs_dir2_sfe_get_ftype,
718 .sf_put_ftype = xfs_dir2_sfe_put_ftype,
719 .sf_get_ino = xfs_dir2_sfe_get_ino,
720 .sf_put_ino = xfs_dir2_sfe_put_ino,
721 .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
722 .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
723
724 .data_entsize = xfs_dir2_data_entsize,
725 .data_get_ftype = xfs_dir2_data_get_ftype,
726 .data_put_ftype = xfs_dir2_data_put_ftype,
727 .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
728 .data_bestfree_p = xfs_dir2_data_bestfree_p,
729
730 .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
731 .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
732 XFS_DIR2_DATA_ENTSIZE(1),
733 .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
734 XFS_DIR2_DATA_ENTSIZE(1) +
735 XFS_DIR2_DATA_ENTSIZE(2),
736 .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
737
738 .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
739 .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
740 .data_first_entry_p = xfs_dir2_data_first_entry_p,
741 .data_entry_p = xfs_dir2_data_entry_p,
742 .data_unused_p = xfs_dir2_data_unused_p,
743
744 .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
745 .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
746 .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
747 .leaf_max_ents = xfs_dir2_max_leaf_ents,
748 .leaf_ents_p = xfs_dir2_leaf_ents_p,
749
750 .node_hdr_size = sizeof(struct xfs_da_node_hdr),
751 .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
752 .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
753 .node_tree_p = xfs_da2_node_tree_p,
754
755 .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
756 .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
757 .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
758 .free_max_bests = xfs_dir2_free_max_bests,
759 .free_bests_p = xfs_dir2_free_bests_p,
760 .db_to_fdb = xfs_dir2_db_to_fdb,
761 .db_to_fdindex = xfs_dir2_db_to_fdindex,
762};
763
764static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
765 .sf_entsize = xfs_dir3_sf_entsize,
766 .sf_nextentry = xfs_dir3_sf_nextentry,
767 .sf_get_ftype = xfs_dir3_sfe_get_ftype,
768 .sf_put_ftype = xfs_dir3_sfe_put_ftype,
769 .sf_get_ino = xfs_dir3_sfe_get_ino,
770 .sf_put_ino = xfs_dir3_sfe_put_ino,
771 .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
772 .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
773
774 .data_entsize = xfs_dir3_data_entsize,
775 .data_get_ftype = xfs_dir3_data_get_ftype,
776 .data_put_ftype = xfs_dir3_data_put_ftype,
777 .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
778 .data_bestfree_p = xfs_dir2_data_bestfree_p,
779
780 .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
781 .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
782 XFS_DIR3_DATA_ENTSIZE(1),
783 .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
784 XFS_DIR3_DATA_ENTSIZE(1) +
785 XFS_DIR3_DATA_ENTSIZE(2),
786 .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
787
788 .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
789 .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
790 .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
791 .data_entry_p = xfs_dir2_data_entry_p,
792 .data_unused_p = xfs_dir2_data_unused_p,
793
794 .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
795 .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
796 .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
797 .leaf_max_ents = xfs_dir2_max_leaf_ents,
798 .leaf_ents_p = xfs_dir2_leaf_ents_p,
799
800 .node_hdr_size = sizeof(struct xfs_da_node_hdr),
801 .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
802 .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
803 .node_tree_p = xfs_da2_node_tree_p,
804
805 .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
806 .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
807 .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
808 .free_max_bests = xfs_dir2_free_max_bests,
809 .free_bests_p = xfs_dir2_free_bests_p,
810 .db_to_fdb = xfs_dir2_db_to_fdb,
811 .db_to_fdindex = xfs_dir2_db_to_fdindex,
812};
813
814static const struct xfs_dir_ops xfs_dir3_ops = {
815 .sf_entsize = xfs_dir3_sf_entsize,
816 .sf_nextentry = xfs_dir3_sf_nextentry,
817 .sf_get_ftype = xfs_dir3_sfe_get_ftype,
818 .sf_put_ftype = xfs_dir3_sfe_put_ftype,
819 .sf_get_ino = xfs_dir3_sfe_get_ino,
820 .sf_put_ino = xfs_dir3_sfe_put_ino,
821 .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
822 .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
823
824 .data_entsize = xfs_dir3_data_entsize,
825 .data_get_ftype = xfs_dir3_data_get_ftype,
826 .data_put_ftype = xfs_dir3_data_put_ftype,
827 .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
828 .data_bestfree_p = xfs_dir3_data_bestfree_p,
829
830 .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
831 .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
832 XFS_DIR3_DATA_ENTSIZE(1),
833 .data_first_offset = sizeof(struct xfs_dir3_data_hdr) +
834 XFS_DIR3_DATA_ENTSIZE(1) +
835 XFS_DIR3_DATA_ENTSIZE(2),
836 .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
837
838 .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
839 .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
840 .data_first_entry_p = xfs_dir3_data_first_entry_p,
841 .data_entry_p = xfs_dir3_data_entry_p,
842 .data_unused_p = xfs_dir3_data_unused_p,
843
844 .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
845 .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
846 .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
847 .leaf_max_ents = xfs_dir3_max_leaf_ents,
848 .leaf_ents_p = xfs_dir3_leaf_ents_p,
849
850 .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
851 .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
852 .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
853 .node_tree_p = xfs_da3_node_tree_p,
854
855 .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
856 .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
857 .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
858 .free_max_bests = xfs_dir3_free_max_bests,
859 .free_bests_p = xfs_dir3_free_bests_p,
860 .db_to_fdb = xfs_dir3_db_to_fdb,
861 .db_to_fdindex = xfs_dir3_db_to_fdindex,
862};
863
864static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
865 .node_hdr_size = sizeof(struct xfs_da_node_hdr),
866 .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
867 .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
868 .node_tree_p = xfs_da2_node_tree_p,
869};
870
871static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
872 .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
873 .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
874 .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
875 .node_tree_p = xfs_da3_node_tree_p,
876};
877
878/*
879 * Return the ops structure according to the current config. If we are passed
880 * an inode, then that overrides the default config we use which is based on
881 * feature bits.
882 */
883const struct xfs_dir_ops *
884xfs_dir_get_ops(
885 struct xfs_mount *mp,
886 struct xfs_inode *dp)
887{
888 if (dp)
889 return dp->d_ops;
890 if (mp->m_dir_inode_ops)
891 return mp->m_dir_inode_ops;
892 if (xfs_sb_version_hascrc(&mp->m_sb))
893 return &xfs_dir3_ops;
894 if (xfs_sb_version_hasftype(&mp->m_sb))
895 return &xfs_dir2_ftype_ops;
896 return &xfs_dir2_ops;
897}
898
899const struct xfs_dir_ops *
900xfs_nondir_get_ops(
901 struct xfs_mount *mp,
902 struct xfs_inode *dp)
903{
904 if (dp)
905 return dp->d_ops;
906 if (mp->m_nondir_inode_ops)
907 return mp->m_nondir_inode_ops;
908 if (xfs_sb_version_hascrc(&mp->m_sb))
909 return &xfs_dir3_nondir_ops;
910 return &xfs_dir2_nondir_ops;
911}
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
new file mode 100644
index 000000000000..0a49b0286372
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -0,0 +1,861 @@
1/*
2 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#ifndef __XFS_DA_FORMAT_H__
20#define __XFS_DA_FORMAT_H__
21
22/*
23 * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
24 *
25 * It is used to manage a doubly linked list of all blocks at the same
26 * level in the Btree, and to identify which type of block this is.
27 */
28#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
29#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
30#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
31#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
32
33typedef struct xfs_da_blkinfo {
34 __be32 forw; /* following block in list */
35 __be32 back; /* previous block in list */
36 __be16 magic; /* validity check on block */
37 __be16 pad; /* unused */
38} xfs_da_blkinfo_t;
39
40/*
41 * CRC enabled directory structure types
42 *
43 * The headers change size for the additional verification information, but
44 * otherwise the tree layouts and contents are unchanged. Hence the da btree
45 * code can use the struct xfs_da_blkinfo for manipulating the tree links and
46 * magic numbers without modification for both v2 and v3 nodes.
47 */
48#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
49#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
50#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */
51#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */
52
53struct xfs_da3_blkinfo {
54 /*
55 * the node link manipulation code relies on the fact that the first
56 * element of this structure is the struct xfs_da_blkinfo so it can
57 * ignore the differences in the rest of the structures.
58 */
59 struct xfs_da_blkinfo hdr;
60 __be32 crc; /* CRC of block */
61 __be64 blkno; /* first block of the buffer */
62 __be64 lsn; /* sequence number of last write */
63 uuid_t uuid; /* filesystem we belong to */
64 __be64 owner; /* inode that owns the block */
65};
66
67/*
68 * This is the structure of the root and intermediate nodes in the Btree.
69 * The leaf nodes are defined above.
70 *
71 * Entries are not packed.
72 *
73 * Since we have duplicate keys, use a binary search but always follow
74 * all matches in the block, not just the first match found.
75 */
76#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
77
78typedef struct xfs_da_node_hdr {
79 struct xfs_da_blkinfo info; /* block type, links, etc. */
80 __be16 __count; /* count of active entries */
81 __be16 __level; /* level above leaves (leaf == 0) */
82} xfs_da_node_hdr_t;
83
84struct xfs_da3_node_hdr {
85 struct xfs_da3_blkinfo info; /* block type, links, etc. */
86 __be16 __count; /* count of active entries */
87 __be16 __level; /* level above leaves (leaf == 0) */
88 __be32 __pad32;
89};
90
91#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
92
93typedef struct xfs_da_node_entry {
94 __be32 hashval; /* hash value for this descendant */
95 __be32 before; /* Btree block before this key */
96} xfs_da_node_entry_t;
97
98typedef struct xfs_da_intnode {
99 struct xfs_da_node_hdr hdr;
100 struct xfs_da_node_entry __btree[];
101} xfs_da_intnode_t;
102
103struct xfs_da3_intnode {
104 struct xfs_da3_node_hdr hdr;
105 struct xfs_da_node_entry __btree[];
106};
107
108/*
109 * In-core version of the node header to abstract the differences in the v2 and
110 * v3 disk format of the headers. Callers need to convert to/from disk format as
111 * appropriate.
112 */
113struct xfs_da3_icnode_hdr {
114 __uint32_t forw;
115 __uint32_t back;
116 __uint16_t magic;
117 __uint16_t count;
118 __uint16_t level;
119};
120
121/*
122 * Directory version 2.
123 *
124 * There are 4 possible formats:
125 * - shortform - embedded into the inode
126 * - single block - data with embedded leaf at the end
127 * - multiple data blocks, single leaf+freeindex block
128 * - data blocks, node and leaf blocks (btree), freeindex blocks
129 *
130 * Note: many node blocks structures and constants are shared with the attr
131 * code and defined in xfs_da_btree.h.
132 */
133
134#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */
135#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */
136#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */
137
138/*
139 * Directory Version 3 With CRCs.
140 *
141 * The tree formats are the same as for version 2 directories. The difference
142 * is in the block header and dirent formats. In many cases the v3 structures
143 * use v2 definitions as they are no different and this makes code sharing much
144 * easier.
145 *
146 * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
147 * format is v2 then they switch to the existing v2 code, or if the format is v3
148 * they implement the v3 functionality. This means the existing dir2 is a mix of
149 * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
150 * where there is a difference in the formats, otherwise the code is unchanged.
151 *
152 * Where it is possible, the code decides what to do based on the magic numbers
153 * in the blocks rather than feature bits in the superblock. This means the code
154 * is as independent of the external XFS code as possible and doesn't require
155 * passing struct xfs_mount pointers into places where it isn't really
156 * necessary.
157 *
158 * Version 3 includes:
159 *
160 * - a larger block header for CRC and identification purposes and so the
161 * offsets of all the structures inside the blocks are different.
162 *
163 * - new magic numbers to be able to detect the v2/v3 types on the fly.
164 */
165
166#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */
167#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */
168#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
169
170/*
171 * Dirents in version 3 directories have a file type field. Additions to this
172 * list are an on-disk format change, requiring feature bits. Valid values
173 * are as follows:
174 */
175#define XFS_DIR3_FT_UNKNOWN 0
176#define XFS_DIR3_FT_REG_FILE 1
177#define XFS_DIR3_FT_DIR 2
178#define XFS_DIR3_FT_CHRDEV 3
179#define XFS_DIR3_FT_BLKDEV 4
180#define XFS_DIR3_FT_FIFO 5
181#define XFS_DIR3_FT_SOCK 6
182#define XFS_DIR3_FT_SYMLINK 7
183#define XFS_DIR3_FT_WHT 8
184
185#define XFS_DIR3_FT_MAX 9
186
187/*
188 * Byte offset in data block and shortform entry.
189 */
190typedef __uint16_t xfs_dir2_data_off_t;
191#define NULLDATAOFF 0xffffU
192typedef uint xfs_dir2_data_aoff_t; /* argument form */
193
194/*
195 * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
196 * Only need 16 bits, this is the byte offset into the single block form.
197 */
198typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
199
200/*
201 * Offset in data space of a data entry.
202 */
203typedef __uint32_t xfs_dir2_dataptr_t;
204#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
205#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
206
207/*
208 * Byte offset in a directory.
209 */
210typedef xfs_off_t xfs_dir2_off_t;
211
212/*
213 * Directory block number (logical dirblk in file)
214 */
215typedef __uint32_t xfs_dir2_db_t;
216
217/*
218 * Inode number stored as 8 8-bit values.
219 */
220typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
221
222/*
223 * Inode number stored as 4 8-bit values.
224 * Works a lot of the time, when all the inode numbers in a directory
225 * fit in 32 bits.
226 */
227typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
228
229typedef union {
230 xfs_dir2_ino8_t i8;
231 xfs_dir2_ino4_t i4;
232} xfs_dir2_inou_t;
233#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
234
235/*
236 * Directory layout when stored internal to an inode.
237 *
238 * Small directories are packed as tightly as possible so as to fit into the
239 * literal area of the inode. These "shortform" directories consist of a
240 * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
241 * structures. Due to the different inode number storage size and the variable
242 * length name field in the xfs_dir2_sf_entry all these structures are
243 * variable length, and the accessors in this file should be used to iterate
244 * over them.
245 */
246typedef struct xfs_dir2_sf_hdr {
247 __uint8_t count; /* count of entries */
248 __uint8_t i8count; /* count of 8-byte inode #s */
249 xfs_dir2_inou_t parent; /* parent dir inode number */
250} __arch_pack xfs_dir2_sf_hdr_t;
251
252typedef struct xfs_dir2_sf_entry {
253 __u8 namelen; /* actual name length */
254 xfs_dir2_sf_off_t offset; /* saved offset */
255 __u8 name[]; /* name, variable size */
256 /*
257 * A single byte containing the file type field follows the inode
258 * number for version 3 directory entries.
259 *
260 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
261 * variable offset after the name.
262 */
263} __arch_pack xfs_dir2_sf_entry_t;
264
265static inline int xfs_dir2_sf_hdr_size(int i8count)
266{
267 return sizeof(struct xfs_dir2_sf_hdr) -
268 (i8count == 0) *
269 (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
270}
271
272static inline xfs_dir2_data_aoff_t
273xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
274{
275 return get_unaligned_be16(&sfep->offset.i);
276}
277
278static inline void
279xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
280{
281 put_unaligned_be16(off, &sfep->offset.i);
282}
283
284static inline struct xfs_dir2_sf_entry *
285xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
286{
287 return (struct xfs_dir2_sf_entry *)
288 ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
289}
290
291/*
292 * Data block structures.
293 *
294 * A pure data block looks like the following drawing on disk:
295 *
296 * +-------------------------------------------------+
297 * | xfs_dir2_data_hdr_t |
298 * +-------------------------------------------------+
299 * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
300 * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
301 * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
302 * | ... |
303 * +-------------------------------------------------+
304 * | unused space |
305 * +-------------------------------------------------+
306 *
307 * As all the entries are variable size structures the accessors below should
308 * be used to iterate over them.
309 *
310 * In addition to the pure data blocks for the data and node formats,
311 * most structures are also used for the combined data/freespace "block"
312 * format below.
313 */
314
315#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
316#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
317#define XFS_DIR2_DATA_FREE_TAG 0xffff
318#define XFS_DIR2_DATA_FD_COUNT 3
319
320/*
321 * Directory address space divided into sections,
322 * spaces separated by 32GB.
323 */
324#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
325#define XFS_DIR2_DATA_SPACE 0
326#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
327
328/*
329 * Describe a free area in the data block.
330 *
331 * The freespace will be formatted as a xfs_dir2_data_unused_t.
332 */
333typedef struct xfs_dir2_data_free {
334 __be16 offset; /* start of freespace */
335 __be16 length; /* length of freespace */
336} xfs_dir2_data_free_t;
337
338/*
339 * Header for the data blocks.
340 *
341 * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
342 */
343typedef struct xfs_dir2_data_hdr {
344 __be32 magic; /* XFS_DIR2_DATA_MAGIC or */
345 /* XFS_DIR2_BLOCK_MAGIC */
346 xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
347} xfs_dir2_data_hdr_t;
348
349/*
350 * define a structure for all the verification fields we are adding to the
351 * directory block structures. This will be used in several structures.
352 * The magic number must be the first entry to align with all the dir2
353 * structures so we determine how to decode them just by the magic number.
354 */
355struct xfs_dir3_blk_hdr {
356 __be32 magic; /* magic number */
357 __be32 crc; /* CRC of block */
358 __be64 blkno; /* first block of the buffer */
359 __be64 lsn; /* sequence number of last write */
360 uuid_t uuid; /* filesystem we belong to */
361 __be64 owner; /* inode that owns the block */
362};
363
364struct xfs_dir3_data_hdr {
365 struct xfs_dir3_blk_hdr hdr;
366 xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
367 __be32 pad; /* 64 bit alignment */
368};
369
370#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
371
372/*
373 * Active entry in a data block.
374 *
375 * Aligned to 8 bytes. After the variable length name field there is a
376 * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
377 *
378 * For dir3 structures, there is a file type field between the name and the tag.
379 * This can only be manipulated by helper functions. It is packed hard against
380 * the end of the name so any padding for rounding is between the file type and
381 * the tag.
382 */
383typedef struct xfs_dir2_data_entry {
384 __be64 inumber; /* inode number */
385 __u8 namelen; /* name length */
386 __u8 name[]; /* name bytes, no null */
387 /* __u8 filetype; */ /* type of inode we point to */
388 /* __be16 tag; */ /* starting offset of us */
389} xfs_dir2_data_entry_t;
390
391/*
392 * Unused entry in a data block.
393 *
394 * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed
395 * using xfs_dir2_data_unused_tag_p.
396 */
397typedef struct xfs_dir2_data_unused {
398 __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */
399 __be16 length; /* total free length */
400 /* variable offset */
401 __be16 tag; /* starting offset of us */
402} xfs_dir2_data_unused_t;
403
404/*
405 * Pointer to a freespace's tag word.
406 */
407static inline __be16 *
408xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
409{
410 return (__be16 *)((char *)dup +
411 be16_to_cpu(dup->length) - sizeof(__be16));
412}
413
414/*
415 * Leaf block structures.
416 *
417 * A pure leaf block looks like the following drawing on disk:
418 *
419 * +---------------------------+
420 * | xfs_dir2_leaf_hdr_t |
421 * +---------------------------+
422 * | xfs_dir2_leaf_entry_t |
423 * | xfs_dir2_leaf_entry_t |
424 * | xfs_dir2_leaf_entry_t |
425 * | xfs_dir2_leaf_entry_t |
426 * | ... |
427 * +---------------------------+
428 * | xfs_dir2_data_off_t |
429 * | xfs_dir2_data_off_t |
430 * | xfs_dir2_data_off_t |
431 * | ... |
432 * +---------------------------+
433 * | xfs_dir2_leaf_tail_t |
434 * +---------------------------+
435 *
436 * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
437 * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
438 * for directories with separate leaf nodes and free space blocks
439 * (magic = XFS_DIR2_LEAFN_MAGIC).
440 *
441 * As all the entries are variable size structures the accessors below should
442 * be used to iterate over them.
443 */
444
445/*
446 * Offset of the leaf/node space. First block in this space
447 * is the btree root.
448 */
449#define XFS_DIR2_LEAF_SPACE 1
450#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
451
452/*
453 * Leaf block header.
454 */
455typedef struct xfs_dir2_leaf_hdr {
456 xfs_da_blkinfo_t info; /* header for da routines */
457 __be16 count; /* count of entries */
458 __be16 stale; /* count of stale entries */
459} xfs_dir2_leaf_hdr_t;
460
461struct xfs_dir3_leaf_hdr {
462 struct xfs_da3_blkinfo info; /* header for da routines */
463 __be16 count; /* count of entries */
464 __be16 stale; /* count of stale entries */
465 __be32 pad; /* 64 bit alignment */
466};
467
468struct xfs_dir3_icleaf_hdr {
469 __uint32_t forw;
470 __uint32_t back;
471 __uint16_t magic;
472 __uint16_t count;
473 __uint16_t stale;
474};
475
476/*
477 * Leaf block entry.
478 */
479typedef struct xfs_dir2_leaf_entry {
480 __be32 hashval; /* hash value of name */
481 __be32 address; /* address of data entry */
482} xfs_dir2_leaf_entry_t;
483
484/*
485 * Leaf block tail.
486 */
487typedef struct xfs_dir2_leaf_tail {
488 __be32 bestcount;
489} xfs_dir2_leaf_tail_t;
490
491/*
492 * Leaf block.
493 */
494typedef struct xfs_dir2_leaf {
495 xfs_dir2_leaf_hdr_t hdr; /* leaf header */
496 xfs_dir2_leaf_entry_t __ents[]; /* entries */
497} xfs_dir2_leaf_t;
498
499struct xfs_dir3_leaf {
500 struct xfs_dir3_leaf_hdr hdr; /* leaf header */
501 struct xfs_dir2_leaf_entry __ents[]; /* entries */
502};
503
504#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
505
506/*
507 * Get address of the bests array in the single-leaf block.
508 */
509static inline __be16 *
510xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
511{
512 return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
513}
514
515/*
516 * Free space block definitions for the node format.
517 */
518
519/*
520 * Offset of the freespace index.
521 */
522#define XFS_DIR2_FREE_SPACE 2
523#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
524
525typedef struct xfs_dir2_free_hdr {
526 __be32 magic; /* XFS_DIR2_FREE_MAGIC */
527 __be32 firstdb; /* db of first entry */
528 __be32 nvalid; /* count of valid entries */
529 __be32 nused; /* count of used entries */
530} xfs_dir2_free_hdr_t;
531
532typedef struct xfs_dir2_free {
533 xfs_dir2_free_hdr_t hdr; /* block header */
534 __be16 bests[]; /* best free counts */
535 /* unused entries are -1 */
536} xfs_dir2_free_t;
537
538struct xfs_dir3_free_hdr {
539 struct xfs_dir3_blk_hdr hdr;
540 __be32 firstdb; /* db of first entry */
541 __be32 nvalid; /* count of valid entries */
542 __be32 nused; /* count of used entries */
543 __be32 pad; /* 64 bit alignment */
544};
545
546struct xfs_dir3_free {
547 struct xfs_dir3_free_hdr hdr;
548 __be16 bests[]; /* best free counts */
549 /* unused entries are -1 */
550};
551
552#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
553
554/*
555 * In core version of the free block header, abstracted away from on-disk format
556 * differences. Use this in the code, and convert to/from the disk version using
557 * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
558 */
559struct xfs_dir3_icfree_hdr {
560 __uint32_t magic;
561 __uint32_t firstdb;
562 __uint32_t nvalid;
563 __uint32_t nused;
564
565};
566
567/*
568 * Single block format.
569 *
570 * The single block format looks like the following drawing on disk:
571 *
572 * +-------------------------------------------------+
573 * | xfs_dir2_data_hdr_t |
574 * +-------------------------------------------------+
575 * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
576 * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
577 * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
578 * | ... |
579 * +-------------------------------------------------+
580 * | unused space |
581 * +-------------------------------------------------+
582 * | ... |
583 * | xfs_dir2_leaf_entry_t |
584 * | xfs_dir2_leaf_entry_t |
585 * +-------------------------------------------------+
586 * | xfs_dir2_block_tail_t |
587 * +-------------------------------------------------+
588 *
589 * As all the entries are variable size structures the accessors below should
590 * be used to iterate over them.
591 */
592
593typedef struct xfs_dir2_block_tail {
594 __be32 count; /* count of leaf entries */
595 __be32 stale; /* count of stale lf entries */
596} xfs_dir2_block_tail_t;
597
598/*
599 * Pointer to the leaf entries embedded in a data block (1-block format)
600 */
601static inline struct xfs_dir2_leaf_entry *
602xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
603{
604 return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
605}
606
607
608/*
609 * Attribute storage layout
610 *
611 * Attribute lists are structured around Btrees where all the data
612 * elements are in the leaf nodes. Attribute names are hashed into an int,
613 * then that int is used as the index into the Btree. Since the hashval
614 * of an attribute name may not be unique, we may have duplicate keys. The
615 * internal links in the Btree are logical block offsets into the file.
616 *
617 * Struct leaf_entry's are packed from the top. Name/values grow from the
618 * bottom but are not packed. The freemap contains run-length-encoded entries
619 * for the free bytes after the leaf_entry's, but only the N largest such,
620 * smaller runs are dropped. When the freemap doesn't show enough space
621 * for an allocation, we compact the name/value area and try again. If we
622 * still don't have enough space, then we have to split the block. The
623 * name/value structs (both local and remote versions) must be 32bit aligned.
624 *
625 * Since we have duplicate hash keys, for each key that matches, compare
626 * the actual name string. The root and intermediate node search always
627 * takes the first-in-the-block key match found, so we should only have
628 * to work "forw"ard. If none matches, continue with the "forw"ard leaf
629 * nodes until the hash key changes or the attribute name is found.
630 *
631 * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
632 * the leaf_entry. The namespaces are independent only because we also look
633 * at the namespace bit when we are looking for a matching attribute name.
634 *
635 * We also store an "incomplete" bit in the leaf_entry. It shows that an
636 * attribute is in the middle of being created and should not be shown to
637 * the user if we crash during the time that the bit is set. We clear the
638 * bit when we have finished setting up the attribute. We do this because
639 * we cannot create some large attributes inside a single transaction, and we
640 * need some indication that we weren't finished if we crash in the middle.
641 */
642#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
643
644typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
645 __be16 base; /* base of free region */
646 __be16 size; /* length of free region */
647} xfs_attr_leaf_map_t;
648
649typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
650 xfs_da_blkinfo_t info; /* block type, links, etc. */
651 __be16 count; /* count of active leaf_entry's */
652 __be16 usedbytes; /* num bytes of names/values stored */
653 __be16 firstused; /* first used byte in name area */
654 __u8 holes; /* != 0 if blk needs compaction */
655 __u8 pad1;
656 xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
657 /* N largest free regions */
658} xfs_attr_leaf_hdr_t;
659
660typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
661 __be32 hashval; /* hash value of name */
662 __be16 nameidx; /* index into buffer of name/value */
663 __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
664 __u8 pad2; /* unused pad byte */
665} xfs_attr_leaf_entry_t;
666
667typedef struct xfs_attr_leaf_name_local {
668 __be16 valuelen; /* number of bytes in value */
669 __u8 namelen; /* length of name bytes */
670 __u8 nameval[1]; /* name/value bytes */
671} xfs_attr_leaf_name_local_t;
672
673typedef struct xfs_attr_leaf_name_remote {
674 __be32 valueblk; /* block number of value bytes */
675 __be32 valuelen; /* number of bytes in value */
676 __u8 namelen; /* length of name bytes */
677 __u8 name[1]; /* name bytes */
678} xfs_attr_leaf_name_remote_t;
679
680typedef struct xfs_attr_leafblock {
681 xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
682 xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
683 xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
684 xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
685} xfs_attr_leafblock_t;
686
687/*
688 * CRC enabled leaf structures. Called "version 3" structures to match the
689 * version number of the directory and dablk structures for this feature, and
690 * attr2 is already taken by the variable inode attribute fork size feature.
691 */
692struct xfs_attr3_leaf_hdr {
693 struct xfs_da3_blkinfo info;
694 __be16 count;
695 __be16 usedbytes;
696 __be16 firstused;
697 __u8 holes;
698 __u8 pad1;
699 struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
700 __be32 pad2; /* 64 bit alignment */
701};
702
703#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
704
705struct xfs_attr3_leafblock {
706 struct xfs_attr3_leaf_hdr hdr;
707 struct xfs_attr_leaf_entry entries[1];
708
709 /*
710 * The rest of the block contains the following structures after the
711 * leaf entries, growing from the bottom up. The variables are never
712 * referenced, the locations accessed purely from helper functions.
713 *
714 * struct xfs_attr_leaf_name_local
715 * struct xfs_attr_leaf_name_remote
716 */
717};
718
719/*
720 * incore, neutral version of the attribute leaf header
721 */
722struct xfs_attr3_icleaf_hdr {
723 __uint32_t forw;
724 __uint32_t back;
725 __uint16_t magic;
726 __uint16_t count;
727 __uint16_t usedbytes;
728 __uint16_t firstused;
729 __u8 holes;
730 struct {
731 __uint16_t base;
732 __uint16_t size;
733 } freemap[XFS_ATTR_LEAF_MAPSIZE];
734};
735
736/*
737 * Flags used in the leaf_entry[i].flags field.
738 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
739 * on the system call, they are "or"ed together for various operations.
740 */
741#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
742#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
743#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
744#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
745#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
746#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
747#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
748#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
749
750/*
751 * Conversion macros for converting namespace bits from argument flags
752 * to ondisk flags.
753 */
754#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
755#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
756#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
757#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
758#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
759 ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
760#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
761 ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
762
763/*
764 * Alignment for namelist and valuelist entries (since they are mixed
765 * there can be only one alignment value)
766 */
767#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
768
769static inline int
770xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
771{
772 if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
773 return sizeof(struct xfs_attr3_leaf_hdr);
774 return sizeof(struct xfs_attr_leaf_hdr);
775}
776
777static inline struct xfs_attr_leaf_entry *
778xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
779{
780 if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
781 return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
782 return &leafp->entries[0];
783}
784
785/*
786 * Cast typed pointers for "local" and "remote" name/value structs.
787 */
788static inline char *
789xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
790{
791 struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
792
793 return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
794}
795
796static inline xfs_attr_leaf_name_remote_t *
797xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
798{
799 return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
800}
801
802static inline xfs_attr_leaf_name_local_t *
803xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
804{
805 return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
806}
807
808/*
809 * Calculate total bytes used (including trailing pad for alignment) for
810 * a "local" name/value structure, a "remote" name/value structure, and
811 * a pointer which might be either.
812 */
813static inline int xfs_attr_leaf_entsize_remote(int nlen)
814{
815 return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
816 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
817}
818
819static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
820{
821 return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
822 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
823}
824
825static inline int xfs_attr_leaf_entsize_local_max(int bsize)
826{
827 return (((bsize) >> 1) + ((bsize) >> 2));
828}
829
830
831
832/*
833 * Remote attribute block format definition
834 *
835 * There is one of these headers per filesystem block in a remote attribute.
836 * This is done to ensure there is a 1:1 mapping between the attribute value
837 * length and the number of blocks needed to store the attribute. This makes the
838 * verification of a buffer a little more complex, but greatly simplifies the
839 * allocation, reading and writing of these attributes as we don't have to guess
840 * the number of blocks needed to store the attribute data.
841 */
842#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
843
844struct xfs_attr3_rmt_hdr {
845 __be32 rm_magic;
846 __be32 rm_offset;
847 __be32 rm_bytes;
848 __be32 rm_crc;
849 uuid_t rm_uuid;
850 __be64 rm_owner;
851 __be64 rm_blkno;
852 __be64 rm_lsn;
853};
854
855#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
856
857#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
858 ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
859 sizeof(struct xfs_attr3_rmt_hdr) : 0))
860
861#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
new file mode 100644
index 000000000000..623bbe8fd921
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dinode.h
@@ -0,0 +1,243 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DINODE_H__
19#define __XFS_DINODE_H__
20
/* Value expected in di_magic: ASCII 'IN'. */
#define XFS_DINODE_MAGIC		0x494e
/* True for inode versions 1 through 3.  Evaluates (v) twice. */
#define XFS_DINODE_GOOD_VERSION(v)	(1 <= (v) && (v) <= 3)
23
24typedef struct xfs_timestamp {
25 __be32 t_sec; /* timestamp seconds */
26 __be32 t_nsec; /* timestamp nanoseconds */
27} xfs_timestamp_t;
28
29/*
30 * On-disk inode structure.
31 *
 * This is just the header or "dinode core"; the inode is expanded to fill a
 * variable size, with the leftover area split into a data and an attribute
 * fork.
34 * The format of the data and attribute fork depends on the format of the
35 * inode as indicated by di_format and di_aformat. To access the data and
36 * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
37 * below.
38 *
39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian.
42 *
43 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
44 * padding field for v3 inodes.
45 */
46typedef struct xfs_dinode {
47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
48 __be16 di_mode; /* mode and type of file */
49 __u8 di_version; /* inode version */
50 __u8 di_format; /* format of di_c data */
51 __be16 di_onlink; /* old number of links to file */
52 __be32 di_uid; /* owner's user id */
53 __be32 di_gid; /* owner's group id */
54 __be32 di_nlink; /* number of links to file */
55 __be16 di_projid_lo; /* lower part of owner's project id */
56 __be16 di_projid_hi; /* higher part owner's project id */
57 __u8 di_pad[6]; /* unused, zeroed space */
58 __be16 di_flushiter; /* incremented on flush */
59 xfs_timestamp_t di_atime; /* time last accessed */
60 xfs_timestamp_t di_mtime; /* time last modified */
61 xfs_timestamp_t di_ctime; /* time created/inode modified */
62 __be64 di_size; /* number of bytes in file */
63 __be64 di_nblocks; /* # of direct & btree blocks used */
64 __be32 di_extsize; /* basic/minimum extent size for file */
65 __be32 di_nextents; /* number of extents in data fork */
66 __be16 di_anextents; /* number of extents in attribute fork*/
67 __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
68 __s8 di_aformat; /* format of attr fork's data */
69 __be32 di_dmevmask; /* DMIG event mask */
70 __be16 di_dmstate; /* DMIG state info */
71 __be16 di_flags; /* random flags, XFS_DIFLAG_... */
72 __be32 di_gen; /* generation number */
73
74 /* di_next_unlinked is the only non-core field in the old dinode */
75 __be32 di_next_unlinked;/* agi unlinked list ptr */
76
77 /* start of the extended dinode, writable fields */
78 __le32 di_crc; /* CRC of the inode */
79 __be64 di_changecount; /* number of attribute changes */
80 __be64 di_lsn; /* flush sequence */
81 __be64 di_flags2; /* more random flags */
82 __u8 di_pad2[16]; /* more padding for future expansion */
83
84 /* fields only written to during inode creation */
85 xfs_timestamp_t di_crtime; /* time created */
86 __be64 di_ino; /* inode number */
87 uuid_t di_uuid; /* UUID of the filesystem */
88
89 /* structure must be padded to 64 bit alignment */
90} xfs_dinode_t;
91
/* Byte offset of di_crc inside the on-disk inode. */
#define XFS_DINODE_CRC_OFF	offsetof(struct xfs_dinode, di_crc)

/* NOTE(review): presumably the ceiling for the 16-bit di_flushiter counter. */
#define DI_MAX_FLUSH		0xffff
95
96/*
97 * Size of the core inode on disk. Version 1 and 2 inodes have
98 * the same size, but version 3 has grown a few additional fields.
99 */
100static inline uint xfs_dinode_size(int version)
101{
102 if (version == 3)
103 return sizeof(struct xfs_dinode);
104 return offsetof(struct xfs_dinode, di_crc);
105}
106
107/*
108 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
109 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
110 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
111 */
/* Largest link count for the 32 bit field: 2^31 - 1. */
#define XFS_MAXLINK		((1U << 31) - 1U)
/* Largest link count for the old 16 bit field. */
#define XFS_MAXLINK_1		65535U
114
115/*
116 * Values for di_format
117 */
/* Fork formats; the numeric values are spelled out explicitly. */
enum xfs_dinode_fmt {
	XFS_DINODE_FMT_DEV	= 0,	/* xfs_dev_t */
	XFS_DINODE_FMT_LOCAL	= 1,	/* bulk data */
	XFS_DINODE_FMT_EXTENTS	= 2,	/* struct xfs_bmbt_rec */
	XFS_DINODE_FMT_BTREE	= 3,	/* struct xfs_bmdr_block */
	XFS_DINODE_FMT_UUID	= 4	/* uuid_t */
};
typedef enum xfs_dinode_fmt xfs_dinode_fmt_t;
125
126/*
127 * Inode minimum and maximum sizes.
128 */
/* log2 of the smallest and largest inode size */
#define XFS_DINODE_MIN_LOG	8
#define XFS_DINODE_MAX_LOG	11

/* the same limits in bytes: 256 and 2048 */
#define XFS_DINODE_MIN_SIZE	(1 << XFS_DINODE_MIN_LOG)
#define XFS_DINODE_MAX_SIZE	(1 << XFS_DINODE_MAX_LOG)
133
134/*
135 * Inode size for given fs.
136 */
/*
 * Bytes left in an inode after the core: the superblock's inode size less
 * the core size for the given inode version, as an int.
 */
#define XFS_LITINO(mp, version)	\
	((int)((mp)->m_sb.sb_inodesize - xfs_dinode_size(version)))
139
140/*
141 * Inode data & attribute fork sizes, per inode.
142 */
/* Non-zero di_forkoff means the inode has an attribute fork. */
#define XFS_DFORK_Q(dip)		((dip)->di_forkoff != 0)
/* Attribute fork offset in bytes: di_forkoff is stored shifted right by 3. */
#define XFS_DFORK_BOFF(dip)		((int)((dip)->di_forkoff << 3))

/* Data fork size: up to the attr fork if there is one, else all of it. */
#define XFS_DFORK_DSIZE(dip,mp)				\
	(XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) :	\
			    XFS_LITINO(mp, (dip)->di_version))

/* Attribute fork size: whatever follows the data fork, or 0 without one. */
#define XFS_DFORK_ASIZE(dip,mp)				\
	(XFS_DFORK_Q(dip) ?				\
	 XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : 0)

/* Size of the fork selected by w (XFS_DATA_FORK or anything else). */
#define XFS_DFORK_SIZE(dip,mp,w)			\
	((w) == XFS_DATA_FORK ? XFS_DFORK_DSIZE(dip, mp) : \
				XFS_DFORK_ASIZE(dip, mp))
158
159/*
160 * Return pointers to the data or attribute forks.
161 */
/*
 * Data fork pointer: the inode address advanced by the core size for the
 * inode's version.  The argument is parenthesized so that expressions such
 * as "p + 1" or "&array[i]" expand correctly.
 */
#define XFS_DFORK_DPTR(dip) \
	((char *)(dip) + xfs_dinode_size((dip)->di_version))
/* Attribute fork pointer: data fork pointer plus the byte fork offset. */
#define XFS_DFORK_APTR(dip)	\
	(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
/* Pointer to the fork selected by w (XFS_DATA_FORK or anything else). */
#define XFS_DFORK_PTR(dip,w)	\
	((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
168
/* Format field of the fork selected by w. */
#define XFS_DFORK_FORMAT(dip,w)				\
	((w) == XFS_DATA_FORK ? (dip)->di_format : (dip)->di_aformat)

/* Extent count of the fork selected by w, converted from big-endian. */
#define XFS_DFORK_NEXTENTS(dip,w)			\
	((w) == XFS_DATA_FORK ?				\
	 be32_to_cpu((dip)->di_nextents) : be16_to_cpu((dip)->di_anextents))
177
/* View the start of a buffer's data (b_addr) as an on-disk inode. */
#define XFS_BUF_TO_DINODE(bp)	((struct xfs_dinode *)((bp)->b_addr))
179
180/*
181 * For block and character special files the 32bit dev_t is stored at the
182 * beginning of the data fork.
183 */
184static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
185{
186 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
187}
188
189static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
190{
191 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
192}
193
194/*
195 * Values for di_flags
196 * There should be a one-to-one correspondence between these flags and the
197 * XFS_XFLAG_s.
198 */
/* Each flag is given as a bit number followed by the matching mask. */

/* file's blocks come from rt area */
#define XFS_DIFLAG_REALTIME_BIT		0
#define XFS_DIFLAG_REALTIME		(1 << XFS_DIFLAG_REALTIME_BIT)
/* file space has been preallocated */
#define XFS_DIFLAG_PREALLOC_BIT		1
#define XFS_DIFLAG_PREALLOC		(1 << XFS_DIFLAG_PREALLOC_BIT)
/* for rtbitmap inode, new format */
#define XFS_DIFLAG_NEWRTBM_BIT		2
#define XFS_DIFLAG_NEWRTBM		(1 << XFS_DIFLAG_NEWRTBM_BIT)
/* inode is immutable */
#define XFS_DIFLAG_IMMUTABLE_BIT	3
#define XFS_DIFLAG_IMMUTABLE		(1 << XFS_DIFLAG_IMMUTABLE_BIT)
/* inode is append-only */
#define XFS_DIFLAG_APPEND_BIT		4
#define XFS_DIFLAG_APPEND		(1 << XFS_DIFLAG_APPEND_BIT)
/* inode is written synchronously */
#define XFS_DIFLAG_SYNC_BIT		5
#define XFS_DIFLAG_SYNC			(1 << XFS_DIFLAG_SYNC_BIT)
/* do not update atime */
#define XFS_DIFLAG_NOATIME_BIT		6
#define XFS_DIFLAG_NOATIME		(1 << XFS_DIFLAG_NOATIME_BIT)
/* do not dump */
#define XFS_DIFLAG_NODUMP_BIT		7
#define XFS_DIFLAG_NODUMP		(1 << XFS_DIFLAG_NODUMP_BIT)
/* create with realtime bit set */
#define XFS_DIFLAG_RTINHERIT_BIT	8
#define XFS_DIFLAG_RTINHERIT		(1 << XFS_DIFLAG_RTINHERIT_BIT)
/* create with parents projid */
#define XFS_DIFLAG_PROJINHERIT_BIT	9
#define XFS_DIFLAG_PROJINHERIT		(1 << XFS_DIFLAG_PROJINHERIT_BIT)
/* disallow symlink creation */
#define XFS_DIFLAG_NOSYMLINKS_BIT	10
#define XFS_DIFLAG_NOSYMLINKS		(1 << XFS_DIFLAG_NOSYMLINKS_BIT)
/* inode extent size allocator hint */
#define XFS_DIFLAG_EXTSIZE_BIT		11
#define XFS_DIFLAG_EXTSIZE		(1 << XFS_DIFLAG_EXTSIZE_BIT)
/* inherit inode extent size */
#define XFS_DIFLAG_EXTSZINHERIT_BIT	12
#define XFS_DIFLAG_EXTSZINHERIT		(1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
/* do not reorganize/defragment */
#define XFS_DIFLAG_NODEFRAG_BIT		13
#define XFS_DIFLAG_NODEFRAG		(1 << XFS_DIFLAG_NODEFRAG_BIT)
/* use filestream allocator */
#define XFS_DIFLAG_FILESTREAM_BIT	14
#define XFS_DIFLAG_FILESTREAM		(1 << XFS_DIFLAG_FILESTREAM_BIT)
229
/*
 * Test the realtime flag of an in-core inode.  Without CONFIG_XFS_RT the
 * test is the constant 0 and the argument is not evaluated.
 */
#if defined(CONFIG_XFS_RT)
#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
#else
#define XFS_IS_REALTIME_INODE(ip) (0)
#endif
235
/* Union of every XFS_DIFLAG_* mask defined in this file. */
#define XFS_DIFLAG_ANY \
	(XFS_DIFLAG_REALTIME	 | \
	 XFS_DIFLAG_PREALLOC	 | \
	 XFS_DIFLAG_NEWRTBM	 | \
	 XFS_DIFLAG_IMMUTABLE	 | \
	 XFS_DIFLAG_APPEND	 | \
	 XFS_DIFLAG_SYNC	 | \
	 XFS_DIFLAG_NOATIME	 | \
	 XFS_DIFLAG_NODUMP	 | \
	 XFS_DIFLAG_RTINHERIT	 | \
	 XFS_DIFLAG_PROJINHERIT	 | \
	 XFS_DIFLAG_NOSYMLINKS	 | \
	 XFS_DIFLAG_EXTSIZE	 | \
	 XFS_DIFLAG_EXTSZINHERIT | \
	 XFS_DIFLAG_NODEFRAG	 | \
	 XFS_DIFLAG_FILESTREAM)
242
243#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
new file mode 100644
index 000000000000..6cef22152fd6
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -0,0 +1,762 @@
1/*
2 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_format.h"
21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h"
23#include "xfs_inum.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
29#include "xfs_inode.h"
30#include "xfs_trans.h"
31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
33#include "xfs_dir2.h"
34#include "xfs_dir2_priv.h"
35#include "xfs_error.h"
36#include "xfs_trace.h"
37#include "xfs_dinode.h"
38
39struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
40
41
42/*
43 * ASCII case-insensitive (ie. A-Z) support for directories that was
44 * used in IRIX.
45 */
46STATIC xfs_dahash_t
47xfs_ascii_ci_hashname(
48 struct xfs_name *name)
49{
50 xfs_dahash_t hash;
51 int i;
52
53 for (i = 0, hash = 0; i < name->len; i++)
54 hash = tolower(name->name[i]) ^ rol32(hash, 7);
55
56 return hash;
57}
58
59STATIC enum xfs_dacmp
60xfs_ascii_ci_compname(
61 struct xfs_da_args *args,
62 const unsigned char *name,
63 int len)
64{
65 enum xfs_dacmp result;
66 int i;
67
68 if (args->namelen != len)
69 return XFS_CMP_DIFFERENT;
70
71 result = XFS_CMP_EXACT;
72 for (i = 0; i < len; i++) {
73 if (args->name[i] == name[i])
74 continue;
75 if (tolower(args->name[i]) != tolower(name[i]))
76 return XFS_CMP_DIFFERENT;
77 result = XFS_CMP_CASE;
78 }
79
80 return result;
81}
82
83static struct xfs_nameops xfs_ascii_ci_nameops = {
84 .hashname = xfs_ascii_ci_hashname,
85 .compname = xfs_ascii_ci_compname,
86};
87
88int
89xfs_da_mount(
90 struct xfs_mount *mp)
91{
92 struct xfs_da_geometry *dageo;
93 int nodehdr_size;
94
95
96 ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
97 ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
98 XFS_MAX_BLOCKSIZE);
99
100 mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
101 mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
102
103 nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
104 mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
105 KM_SLEEP | KM_MAYFAIL);
106 mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
107 KM_SLEEP | KM_MAYFAIL);
108 if (!mp->m_dir_geo || !mp->m_attr_geo) {
109 kmem_free(mp->m_dir_geo);
110 kmem_free(mp->m_attr_geo);
111 return -ENOMEM;
112 }
113
114 /* set up directory geometry */
115 dageo = mp->m_dir_geo;
116 dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
117 dageo->fsblog = mp->m_sb.sb_blocklog;
118 dageo->blksize = 1 << dageo->blklog;
119 dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
120
121 /*
122 * Now we've set up the block conversion variables, we can calculate the
123 * segment block constants using the geometry structure.
124 */
125 dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
126 dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
127 dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
128 dageo->node_ents = (dageo->blksize - nodehdr_size) /
129 (uint)sizeof(xfs_da_node_entry_t);
130 dageo->magicpct = (dageo->blksize * 37) / 100;
131
132 /* set up attribute geometry - single fsb only */
133 dageo = mp->m_attr_geo;
134 dageo->blklog = mp->m_sb.sb_blocklog;
135 dageo->fsblog = mp->m_sb.sb_blocklog;
136 dageo->blksize = 1 << dageo->blklog;
137 dageo->fsbcount = 1;
138 dageo->node_ents = (dageo->blksize - nodehdr_size) /
139 (uint)sizeof(xfs_da_node_entry_t);
140 dageo->magicpct = (dageo->blksize * 37) / 100;
141
142 if (xfs_sb_version_hasasciici(&mp->m_sb))
143 mp->m_dirnameops = &xfs_ascii_ci_nameops;
144 else
145 mp->m_dirnameops = &xfs_default_nameops;
146
147 return 0;
148}
149
150void
151xfs_da_unmount(
152 struct xfs_mount *mp)
153{
154 kmem_free(mp->m_dir_geo);
155 kmem_free(mp->m_attr_geo);
156}
157
158/*
159 * Return 1 if directory contains only "." and "..".
160 */
161int
162xfs_dir_isempty(
163 xfs_inode_t *dp)
164{
165 xfs_dir2_sf_hdr_t *sfp;
166
167 ASSERT(S_ISDIR(dp->i_d.di_mode));
168 if (dp->i_d.di_size == 0) /* might happen during shutdown. */
169 return 1;
170 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
171 return 0;
172 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
173 return !sfp->count;
174}
175
176/*
177 * Validate a given inode number.
178 */
179int
180xfs_dir_ino_validate(
181 xfs_mount_t *mp,
182 xfs_ino_t ino)
183{
184 xfs_agblock_t agblkno;
185 xfs_agino_t agino;
186 xfs_agnumber_t agno;
187 int ino_ok;
188 int ioff;
189
190 agno = XFS_INO_TO_AGNO(mp, ino);
191 agblkno = XFS_INO_TO_AGBNO(mp, ino);
192 ioff = XFS_INO_TO_OFFSET(mp, ino);
193 agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
194 ino_ok =
195 agno < mp->m_sb.sb_agcount &&
196 agblkno < mp->m_sb.sb_agblocks &&
197 agblkno != 0 &&
198 ioff < (1 << mp->m_sb.sb_inopblog) &&
199 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
200 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
201 XFS_RANDOM_DIR_INO_VALIDATE))) {
202 xfs_warn(mp, "Invalid inode number 0x%Lx",
203 (unsigned long long) ino);
204 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
205 return -EFSCORRUPTED;
206 }
207 return 0;
208}
209
210/*
211 * Initialize a directory with its "." and ".." entries.
212 */
213int
214xfs_dir_init(
215 xfs_trans_t *tp,
216 xfs_inode_t *dp,
217 xfs_inode_t *pdp)
218{
219 struct xfs_da_args *args;
220 int error;
221
222 ASSERT(S_ISDIR(dp->i_d.di_mode));
223 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
224 if (error)
225 return error;
226
227 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
228 if (!args)
229 return -ENOMEM;
230
231 args->geo = dp->i_mount->m_dir_geo;
232 args->dp = dp;
233 args->trans = tp;
234 error = xfs_dir2_sf_create(args, pdp->i_ino);
235 kmem_free(args);
236 return error;
237}
238
/*
 * Enter a name in a directory.
 */
242int
243xfs_dir_createname(
244 xfs_trans_t *tp,
245 xfs_inode_t *dp,
246 struct xfs_name *name,
247 xfs_ino_t inum, /* new entry inode number */
248 xfs_fsblock_t *first, /* bmap's firstblock */
249 xfs_bmap_free_t *flist, /* bmap's freeblock list */
250 xfs_extlen_t total) /* bmap's total block count */
251{
252 struct xfs_da_args *args;
253 int rval;
254 int v; /* type-checking value */
255
256 ASSERT(S_ISDIR(dp->i_d.di_mode));
257 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
258 if (rval)
259 return rval;
260 XFS_STATS_INC(xs_dir_create);
261
262 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
263 if (!args)
264 return -ENOMEM;
265
266 args->geo = dp->i_mount->m_dir_geo;
267 args->name = name->name;
268 args->namelen = name->len;
269 args->filetype = name->type;
270 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
271 args->inumber = inum;
272 args->dp = dp;
273 args->firstblock = first;
274 args->flist = flist;
275 args->total = total;
276 args->whichfork = XFS_DATA_FORK;
277 args->trans = tp;
278 args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
279
280 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
281 rval = xfs_dir2_sf_addname(args);
282 goto out_free;
283 }
284
285 rval = xfs_dir2_isblock(args, &v);
286 if (rval)
287 goto out_free;
288 if (v) {
289 rval = xfs_dir2_block_addname(args);
290 goto out_free;
291 }
292
293 rval = xfs_dir2_isleaf(args, &v);
294 if (rval)
295 goto out_free;
296 if (v)
297 rval = xfs_dir2_leaf_addname(args);
298 else
299 rval = xfs_dir2_node_addname(args);
300
301out_free:
302 kmem_free(args);
303 return rval;
304}
305
306/*
307 * If doing a CI lookup and case-insensitive match, dup actual name into
308 * args.value. Return EEXIST for success (ie. name found) or an error.
309 */
310int
311xfs_dir_cilookup_result(
312 struct xfs_da_args *args,
313 const unsigned char *name,
314 int len)
315{
316 if (args->cmpresult == XFS_CMP_DIFFERENT)
317 return -ENOENT;
318 if (args->cmpresult != XFS_CMP_CASE ||
319 !(args->op_flags & XFS_DA_OP_CILOOKUP))
320 return -EEXIST;
321
322 args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
323 if (!args->value)
324 return -ENOMEM;
325
326 memcpy(args->value, name, len);
327 args->valuelen = len;
328 return -EEXIST;
329}
330
331/*
332 * Lookup a name in a directory, give back the inode number.
333 * If ci_name is not NULL, returns the actual name in ci_name if it differs
334 * to name, or ci_name->name is set to NULL for an exact match.
335 */
336
337int
338xfs_dir_lookup(
339 xfs_trans_t *tp,
340 xfs_inode_t *dp,
341 struct xfs_name *name,
342 xfs_ino_t *inum, /* out: inode number */
343 struct xfs_name *ci_name) /* out: actual name if CI match */
344{
345 struct xfs_da_args *args;
346 int rval;
347 int v; /* type-checking value */
348
349 ASSERT(S_ISDIR(dp->i_d.di_mode));
350 XFS_STATS_INC(xs_dir_lookup);
351
352 /*
353 * We need to use KM_NOFS here so that lockdep will not throw false
354 * positive deadlock warnings on a non-transactional lookup path. It is
355 * safe to recurse into inode recalim in that case, but lockdep can't
356 * easily be taught about it. Hence KM_NOFS avoids having to add more
357 * lockdep Doing this avoids having to add a bunch of lockdep class
358 * annotations into the reclaim path for the ilock.
359 */
360 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
361 args->geo = dp->i_mount->m_dir_geo;
362 args->name = name->name;
363 args->namelen = name->len;
364 args->filetype = name->type;
365 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
366 args->dp = dp;
367 args->whichfork = XFS_DATA_FORK;
368 args->trans = tp;
369 args->op_flags = XFS_DA_OP_OKNOENT;
370 if (ci_name)
371 args->op_flags |= XFS_DA_OP_CILOOKUP;
372
373 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
374 rval = xfs_dir2_sf_lookup(args);
375 goto out_check_rval;
376 }
377
378 rval = xfs_dir2_isblock(args, &v);
379 if (rval)
380 goto out_free;
381 if (v) {
382 rval = xfs_dir2_block_lookup(args);
383 goto out_check_rval;
384 }
385
386 rval = xfs_dir2_isleaf(args, &v);
387 if (rval)
388 goto out_free;
389 if (v)
390 rval = xfs_dir2_leaf_lookup(args);
391 else
392 rval = xfs_dir2_node_lookup(args);
393
394out_check_rval:
395 if (rval == -EEXIST)
396 rval = 0;
397 if (!rval) {
398 *inum = args->inumber;
399 if (ci_name) {
400 ci_name->name = args->value;
401 ci_name->len = args->valuelen;
402 }
403 }
404out_free:
405 kmem_free(args);
406 return rval;
407}
408
409/*
410 * Remove an entry from a directory.
411 */
412int
413xfs_dir_removename(
414 xfs_trans_t *tp,
415 xfs_inode_t *dp,
416 struct xfs_name *name,
417 xfs_ino_t ino,
418 xfs_fsblock_t *first, /* bmap's firstblock */
419 xfs_bmap_free_t *flist, /* bmap's freeblock list */
420 xfs_extlen_t total) /* bmap's total block count */
421{
422 struct xfs_da_args *args;
423 int rval;
424 int v; /* type-checking value */
425
426 ASSERT(S_ISDIR(dp->i_d.di_mode));
427 XFS_STATS_INC(xs_dir_remove);
428
429 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
430 if (!args)
431 return -ENOMEM;
432
433 args->geo = dp->i_mount->m_dir_geo;
434 args->name = name->name;
435 args->namelen = name->len;
436 args->filetype = name->type;
437 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
438 args->inumber = ino;
439 args->dp = dp;
440 args->firstblock = first;
441 args->flist = flist;
442 args->total = total;
443 args->whichfork = XFS_DATA_FORK;
444 args->trans = tp;
445
446 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
447 rval = xfs_dir2_sf_removename(args);
448 goto out_free;
449 }
450
451 rval = xfs_dir2_isblock(args, &v);
452 if (rval)
453 goto out_free;
454 if (v) {
455 rval = xfs_dir2_block_removename(args);
456 goto out_free;
457 }
458
459 rval = xfs_dir2_isleaf(args, &v);
460 if (rval)
461 goto out_free;
462 if (v)
463 rval = xfs_dir2_leaf_removename(args);
464 else
465 rval = xfs_dir2_node_removename(args);
466out_free:
467 kmem_free(args);
468 return rval;
469}
470
471/*
472 * Replace the inode number of a directory entry.
473 */
474int
475xfs_dir_replace(
476 xfs_trans_t *tp,
477 xfs_inode_t *dp,
478 struct xfs_name *name, /* name of entry to replace */
479 xfs_ino_t inum, /* new inode number */
480 xfs_fsblock_t *first, /* bmap's firstblock */
481 xfs_bmap_free_t *flist, /* bmap's freeblock list */
482 xfs_extlen_t total) /* bmap's total block count */
483{
484 struct xfs_da_args *args;
485 int rval;
486 int v; /* type-checking value */
487
488 ASSERT(S_ISDIR(dp->i_d.di_mode));
489
490 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
491 if (rval)
492 return rval;
493
494 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
495 if (!args)
496 return -ENOMEM;
497
498 args->geo = dp->i_mount->m_dir_geo;
499 args->name = name->name;
500 args->namelen = name->len;
501 args->filetype = name->type;
502 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
503 args->inumber = inum;
504 args->dp = dp;
505 args->firstblock = first;
506 args->flist = flist;
507 args->total = total;
508 args->whichfork = XFS_DATA_FORK;
509 args->trans = tp;
510
511 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
512 rval = xfs_dir2_sf_replace(args);
513 goto out_free;
514 }
515
516 rval = xfs_dir2_isblock(args, &v);
517 if (rval)
518 goto out_free;
519 if (v) {
520 rval = xfs_dir2_block_replace(args);
521 goto out_free;
522 }
523
524 rval = xfs_dir2_isleaf(args, &v);
525 if (rval)
526 goto out_free;
527 if (v)
528 rval = xfs_dir2_leaf_replace(args);
529 else
530 rval = xfs_dir2_node_replace(args);
531out_free:
532 kmem_free(args);
533 return rval;
534}
535
536/*
537 * See if this entry can be added to the directory without allocating space.
538 * First checks that the caller couldn't reserve enough space (resblks = 0).
539 */
540int
541xfs_dir_canenter(
542 xfs_trans_t *tp,
543 xfs_inode_t *dp,
544 struct xfs_name *name, /* name of entry to add */
545 uint resblks)
546{
547 struct xfs_da_args *args;
548 int rval;
549 int v; /* type-checking value */
550
551 if (resblks)
552 return 0;
553
554 ASSERT(S_ISDIR(dp->i_d.di_mode));
555
556 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
557 if (!args)
558 return -ENOMEM;
559
560 args->geo = dp->i_mount->m_dir_geo;
561 args->name = name->name;
562 args->namelen = name->len;
563 args->filetype = name->type;
564 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
565 args->dp = dp;
566 args->whichfork = XFS_DATA_FORK;
567 args->trans = tp;
568 args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
569 XFS_DA_OP_OKNOENT;
570
571 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
572 rval = xfs_dir2_sf_addname(args);
573 goto out_free;
574 }
575
576 rval = xfs_dir2_isblock(args, &v);
577 if (rval)
578 goto out_free;
579 if (v) {
580 rval = xfs_dir2_block_addname(args);
581 goto out_free;
582 }
583
584 rval = xfs_dir2_isleaf(args, &v);
585 if (rval)
586 goto out_free;
587 if (v)
588 rval = xfs_dir2_leaf_addname(args);
589 else
590 rval = xfs_dir2_node_addname(args);
591out_free:
592 kmem_free(args);
593 return rval;
594}
595
596/*
597 * Utility routines.
598 */
599
600/*
601 * Add a block to the directory.
602 *
603 * This routine is for data and free blocks, not leaf/node blocks which are
604 * handled by xfs_da_grow_inode.
605 */
606int
607xfs_dir2_grow_inode(
608 struct xfs_da_args *args,
609 int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
610 xfs_dir2_db_t *dbp) /* out: block number added */
611{
612 struct xfs_inode *dp = args->dp;
613 struct xfs_mount *mp = dp->i_mount;
614 xfs_fileoff_t bno; /* directory offset of new block */
615 int count; /* count of filesystem blocks */
616 int error;
617
618 trace_xfs_dir2_grow_inode(args, space);
619
620 /*
621 * Set lowest possible block in the space requested.
622 */
623 bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
624 count = args->geo->fsbcount;
625
626 error = xfs_da_grow_inode_int(args, &bno, count);
627 if (error)
628 return error;
629
630 *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
631
632 /*
633 * Update file's size if this is the data space and it grew.
634 */
635 if (space == XFS_DIR2_DATA_SPACE) {
636 xfs_fsize_t size; /* directory file (data) size */
637
638 size = XFS_FSB_TO_B(mp, bno + count);
639 if (size > dp->i_d.di_size) {
640 dp->i_d.di_size = size;
641 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
642 }
643 }
644 return 0;
645}
646
647/*
648 * See if the directory is a single-block form directory.
649 */
650int
651xfs_dir2_isblock(
652 struct xfs_da_args *args,
653 int *vp) /* out: 1 is block, 0 is not block */
654{
655 xfs_fileoff_t last; /* last file offset */
656 int rval;
657
658 if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
659 return rval;
660 rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
661 ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
662 *vp = rval;
663 return 0;
664}
665
666/*
667 * See if the directory is a single-leaf form directory.
668 */
669int
670xfs_dir2_isleaf(
671 struct xfs_da_args *args,
672 int *vp) /* out: 1 is block, 0 is not block */
673{
674 xfs_fileoff_t last; /* last file offset */
675 int rval;
676
677 if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
678 return rval;
679 *vp = last == args->geo->leafblk + args->geo->fsbcount;
680 return 0;
681}
682
683/*
684 * Remove the given block from the directory.
685 * This routine is used for data and free blocks, leaf/node are done
686 * by xfs_da_shrink_inode.
687 */
688int
689xfs_dir2_shrink_inode(
690 xfs_da_args_t *args,
691 xfs_dir2_db_t db,
692 struct xfs_buf *bp)
693{
694 xfs_fileoff_t bno; /* directory file offset */
695 xfs_dablk_t da; /* directory file offset */
696 int done; /* bunmap is finished */
697 xfs_inode_t *dp;
698 int error;
699 xfs_mount_t *mp;
700 xfs_trans_t *tp;
701
702 trace_xfs_dir2_shrink_inode(args, db);
703
704 dp = args->dp;
705 mp = dp->i_mount;
706 tp = args->trans;
707 da = xfs_dir2_db_to_da(args->geo, db);
708 /*
709 * Unmap the fsblock(s).
710 */
711 if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
712 XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
713 &done))) {
714 /*
715 * ENOSPC actually can happen if we're in a removename with
716 * no space reservation, and the resulting block removal
717 * would cause a bmap btree split or conversion from extents
718 * to btree. This can only happen for un-fragmented
719 * directory blocks, since you need to be punching out
720 * the middle of an extent.
721 * In this case we need to leave the block in the file,
722 * and not binval it.
723 * So the block has to be in a consistent empty state
724 * and appropriately logged.
725 * We don't free up the buffer, the caller can tell it
726 * hasn't happened since it got an error back.
727 */
728 return error;
729 }
730 ASSERT(done);
731 /*
732 * Invalidate the buffer from the transaction.
733 */
734 xfs_trans_binval(tp, bp);
735 /*
736 * If it's not a data block, we're done.
737 */
738 if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
739 return 0;
740 /*
741 * If the block isn't the last one in the directory, we're done.
742 */
743 if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
744 return 0;
745 bno = da;
746 if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
747 /*
748 * This can't really happen unless there's kernel corruption.
749 */
750 return error;
751 }
752 if (db == args->geo->datablk)
753 ASSERT(bno == 0);
754 else
755 ASSERT(bno > 0);
756 /*
757 * Set the size to the new last block.
758 */
759 dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
760 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
761 return 0;
762}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
new file mode 100644
index 000000000000..c8e86b0b5e99
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -0,0 +1,180 @@
1/*
2 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DIR2_H__
19#define __XFS_DIR2_H__
20
/*
 * Forward declarations of the structure types referenced by pointer in the
 * declarations below (kept in alphabetical order).
 */
struct xfs_bmap_free;
struct xfs_da_args;
struct xfs_dir2_data_entry;
struct xfs_dir2_data_hdr;
struct xfs_dir2_data_unused;
struct xfs_dir2_sf_entry;
struct xfs_dir2_sf_hdr;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;

extern struct xfs_name	xfs_name_dotdot;
33
34/*
35 * directory operations vector for encode/decode routines
36 */
37struct xfs_dir_ops {
38 int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
39 struct xfs_dir2_sf_entry *
40 (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
41 struct xfs_dir2_sf_entry *sfep);
42 __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
43 void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
44 __uint8_t ftype);
45 xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
46 struct xfs_dir2_sf_entry *sfep);
47 void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
48 struct xfs_dir2_sf_entry *sfep,
49 xfs_ino_t ino);
50 xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
51 void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
52 xfs_ino_t ino);
53
54 int (*data_entsize)(int len);
55 __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
56 void (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
57 __uint8_t ftype);
58 __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
59 struct xfs_dir2_data_free *
60 (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
61
62 xfs_dir2_data_aoff_t data_dot_offset;
63 xfs_dir2_data_aoff_t data_dotdot_offset;
64 xfs_dir2_data_aoff_t data_first_offset;
65 size_t data_entry_offset;
66
67 struct xfs_dir2_data_entry *
68 (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
69 struct xfs_dir2_data_entry *
70 (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
71 struct xfs_dir2_data_entry *
72 (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
73 struct xfs_dir2_data_entry *
74 (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
75 struct xfs_dir2_data_unused *
76 (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
77
78 int leaf_hdr_size;
79 void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
80 struct xfs_dir3_icleaf_hdr *from);
81 void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
82 struct xfs_dir2_leaf *from);
83 int (*leaf_max_ents)(struct xfs_da_geometry *geo);
84 struct xfs_dir2_leaf_entry *
85 (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
86
87 int node_hdr_size;
88 void (*node_hdr_to_disk)(struct xfs_da_intnode *to,
89 struct xfs_da3_icnode_hdr *from);
90 void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
91 struct xfs_da_intnode *from);
92 struct xfs_da_node_entry *
93 (*node_tree_p)(struct xfs_da_intnode *dap);
94
95 int free_hdr_size;
96 void (*free_hdr_to_disk)(struct xfs_dir2_free *to,
97 struct xfs_dir3_icfree_hdr *from);
98 void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
99 struct xfs_dir2_free *from);
100 int (*free_max_bests)(struct xfs_da_geometry *geo);
101 __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
102 xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
103 xfs_dir2_db_t db);
104 int (*db_to_fdindex)(struct xfs_da_geometry *geo,
105 xfs_dir2_db_t db);
106};
107
/* Look up the operations vector for a (mount, inode) pair. */
const struct xfs_dir_ops *xfs_dir_get_ops(struct xfs_mount *mp,
					  struct xfs_inode *dp);
const struct xfs_dir_ops *xfs_nondir_get_ops(struct xfs_mount *mp,
					     struct xfs_inode *dp);
112
113/*
114 * Generic directory interface routines
115 */
116extern void xfs_dir_startup(void);
117extern int xfs_da_mount(struct xfs_mount *mp);
118extern void xfs_da_unmount(struct xfs_mount *mp);
119
120extern int xfs_dir_isempty(struct xfs_inode *dp);
121extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
122 struct xfs_inode *pdp);
123extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
124 struct xfs_name *name, xfs_ino_t inum,
125 xfs_fsblock_t *first,
126 struct xfs_bmap_free *flist, xfs_extlen_t tot);
127extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
128 struct xfs_name *name, xfs_ino_t *inum,
129 struct xfs_name *ci_name);
130extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
131 struct xfs_name *name, xfs_ino_t ino,
132 xfs_fsblock_t *first,
133 struct xfs_bmap_free *flist, xfs_extlen_t tot);
134extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
135 struct xfs_name *name, xfs_ino_t inum,
136 xfs_fsblock_t *first,
137 struct xfs_bmap_free *flist, xfs_extlen_t tot);
138extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
139 struct xfs_name *name, uint resblks);
140
/*
 * Called directly from the bmap code, bypassing the generic directory layer.
 */
int	xfs_dir2_sf_to_block(struct xfs_da_args *args);
145
146/*
147 * Interface routines used by userspace utilities
148 */
149extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
150extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
151extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
152 struct xfs_buf *bp);
153
154extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
155 struct xfs_dir2_data_hdr *hdr, int *loghead);
156extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
157 struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
158extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
159 struct xfs_buf *bp);
160extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
161 struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
162extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
163 struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
164 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
165extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
166 struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
167 xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
168 int *needlogp, int *needscanp);
169
170extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
171 struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
172 struct xfs_dir2_data_unused *dup);
173
/* Buffer verifier operation tables, one per directory buffer type. */
extern const struct xfs_buf_ops	xfs_dir3_block_buf_ops;
extern const struct xfs_buf_ops	xfs_dir3_data_buf_ops;
extern const struct xfs_buf_ops	xfs_dir3_free_buf_ops;
extern const struct xfs_buf_ops	xfs_dir3_leaf1_buf_ops;
extern const struct xfs_buf_ops	xfs_dir3_leafn_buf_ops;
179
180#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
new file mode 100644
index 000000000000..9628ceccfa02
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -0,0 +1,1265 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
29#include "xfs_inode.h"
30#include "xfs_trans.h"
31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
33#include "xfs_buf_item.h"
34#include "xfs_dir2.h"
35#include "xfs_dir2_priv.h"
36#include "xfs_error.h"
37#include "xfs_trace.h"
38#include "xfs_cksum.h"
39#include "xfs_dinode.h"
40
41/*
42 * Local function prototypes.
43 */
44static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
45 int first, int last);
46static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
47static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
48 int *entno);
49static int xfs_dir2_block_sort(const void *a, const void *b);
50
51static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
52
53/*
54 * One-time startup routine called from xfs_init().
55 */
56void
57xfs_dir_startup(void)
58{
59 xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
60 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
61}
62
63static bool
64xfs_dir3_block_verify(
65 struct xfs_buf *bp)
66{
67 struct xfs_mount *mp = bp->b_target->bt_mount;
68 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
69
70 if (xfs_sb_version_hascrc(&mp->m_sb)) {
71 if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
72 return false;
73 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
74 return false;
75 if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
76 return false;
77 } else {
78 if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
79 return false;
80 }
81 if (__xfs_dir3_data_check(NULL, bp))
82 return false;
83 return true;
84}
85
86static void
87xfs_dir3_block_read_verify(
88 struct xfs_buf *bp)
89{
90 struct xfs_mount *mp = bp->b_target->bt_mount;
91
92 if (xfs_sb_version_hascrc(&mp->m_sb) &&
93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
94 xfs_buf_ioerror(bp, -EFSBADCRC);
95 else if (!xfs_dir3_block_verify(bp))
96 xfs_buf_ioerror(bp, -EFSCORRUPTED);
97
98 if (bp->b_error)
99 xfs_verifier_error(bp);
100}
101
102static void
103xfs_dir3_block_write_verify(
104 struct xfs_buf *bp)
105{
106 struct xfs_mount *mp = bp->b_target->bt_mount;
107 struct xfs_buf_log_item *bip = bp->b_fspriv;
108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
109
110 if (!xfs_dir3_block_verify(bp)) {
111 xfs_buf_ioerror(bp, -EFSCORRUPTED);
112 xfs_verifier_error(bp);
113 return;
114 }
115
116 if (!xfs_sb_version_hascrc(&mp->m_sb))
117 return;
118
119 if (bip)
120 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
121
122 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
123}
124
125const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
126 .verify_read = xfs_dir3_block_read_verify,
127 .verify_write = xfs_dir3_block_write_verify,
128};
129
130int
131xfs_dir3_block_read(
132 struct xfs_trans *tp,
133 struct xfs_inode *dp,
134 struct xfs_buf **bpp)
135{
136 struct xfs_mount *mp = dp->i_mount;
137 int err;
138
139 err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
140 XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
141 if (!err && tp)
142 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
143 return err;
144}
145
146static void
147xfs_dir3_block_init(
148 struct xfs_mount *mp,
149 struct xfs_trans *tp,
150 struct xfs_buf *bp,
151 struct xfs_inode *dp)
152{
153 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
154
155 bp->b_ops = &xfs_dir3_block_buf_ops;
156 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
157
158 if (xfs_sb_version_hascrc(&mp->m_sb)) {
159 memset(hdr3, 0, sizeof(*hdr3));
160 hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
161 hdr3->blkno = cpu_to_be64(bp->b_bn);
162 hdr3->owner = cpu_to_be64(dp->i_ino);
163 uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
164 return;
165
166 }
167 hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
168}
169
170static void
171xfs_dir2_block_need_space(
172 struct xfs_inode *dp,
173 struct xfs_dir2_data_hdr *hdr,
174 struct xfs_dir2_block_tail *btp,
175 struct xfs_dir2_leaf_entry *blp,
176 __be16 **tagpp,
177 struct xfs_dir2_data_unused **dupp,
178 struct xfs_dir2_data_unused **enddupp,
179 int *compact,
180 int len)
181{
182 struct xfs_dir2_data_free *bf;
183 __be16 *tagp = NULL;
184 struct xfs_dir2_data_unused *dup = NULL;
185 struct xfs_dir2_data_unused *enddup = NULL;
186
187 *compact = 0;
188 bf = dp->d_ops->data_bestfree_p(hdr);
189
190 /*
191 * If there are stale entries we'll use one for the leaf.
192 */
193 if (btp->stale) {
194 if (be16_to_cpu(bf[0].length) >= len) {
195 /*
196 * The biggest entry enough to avoid compaction.
197 */
198 dup = (xfs_dir2_data_unused_t *)
199 ((char *)hdr + be16_to_cpu(bf[0].offset));
200 goto out;
201 }
202
203 /*
204 * Will need to compact to make this work.
205 * Tag just before the first leaf entry.
206 */
207 *compact = 1;
208 tagp = (__be16 *)blp - 1;
209
210 /* Data object just before the first leaf entry. */
211 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
212
213 /*
214 * If it's not free then the data will go where the
215 * leaf data starts now, if it works at all.
216 */
217 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
218 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
219 (uint)sizeof(*blp) < len)
220 dup = NULL;
221 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
222 dup = NULL;
223 else
224 dup = (xfs_dir2_data_unused_t *)blp;
225 goto out;
226 }
227
228 /*
229 * no stale entries, so just use free space.
230 * Tag just before the first leaf entry.
231 */
232 tagp = (__be16 *)blp - 1;
233
234 /* Data object just before the first leaf entry. */
235 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
236
237 /*
238 * If it's not free then can't do this add without cleaning up:
239 * the space before the first leaf entry needs to be free so it
240 * can be expanded to hold the pointer to the new entry.
241 */
242 if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
243 /*
244 * Check out the biggest freespace and see if it's the same one.
245 */
246 dup = (xfs_dir2_data_unused_t *)
247 ((char *)hdr + be16_to_cpu(bf[0].offset));
248 if (dup != enddup) {
249 /*
250 * Not the same free entry, just check its length.
251 */
252 if (be16_to_cpu(dup->length) < len)
253 dup = NULL;
254 goto out;
255 }
256
257 /*
258 * It is the biggest freespace, can it hold the leaf too?
259 */
260 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
261 /*
262 * Yes, use the second-largest entry instead if it works.
263 */
264 if (be16_to_cpu(bf[1].length) >= len)
265 dup = (xfs_dir2_data_unused_t *)
266 ((char *)hdr + be16_to_cpu(bf[1].offset));
267 else
268 dup = NULL;
269 }
270 }
271out:
272 *tagpp = tagp;
273 *dupp = dup;
274 *enddupp = enddup;
275}
276
277/*
278 * compact the leaf entries.
279 * Leave the highest-numbered stale entry stale.
280 * XXX should be the one closest to mid but mid is not yet computed.
281 */
282static void
283xfs_dir2_block_compact(
284 struct xfs_da_args *args,
285 struct xfs_buf *bp,
286 struct xfs_dir2_data_hdr *hdr,
287 struct xfs_dir2_block_tail *btp,
288 struct xfs_dir2_leaf_entry *blp,
289 int *needlog,
290 int *lfloghigh,
291 int *lfloglow)
292{
293 int fromidx; /* source leaf index */
294 int toidx; /* target leaf index */
295 int needscan = 0;
296 int highstale; /* high stale index */
297
298 fromidx = toidx = be32_to_cpu(btp->count) - 1;
299 highstale = *lfloghigh = -1;
300 for (; fromidx >= 0; fromidx--) {
301 if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
302 if (highstale == -1)
303 highstale = toidx;
304 else {
305 if (*lfloghigh == -1)
306 *lfloghigh = toidx;
307 continue;
308 }
309 }
310 if (fromidx < toidx)
311 blp[toidx] = blp[fromidx];
312 toidx--;
313 }
314 *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
315 *lfloghigh -= be32_to_cpu(btp->stale) - 1;
316 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
317 xfs_dir2_data_make_free(args, bp,
318 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
319 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
320 needlog, &needscan);
321 btp->stale = cpu_to_be32(1);
322 /*
323 * If we now need to rebuild the bestfree map, do so.
324 * This needs to happen before the next call to use_free.
325 */
326 if (needscan)
327 xfs_dir2_data_freescan(args->dp, hdr, needlog);
328}
329
330/*
331 * Add an entry to a block directory.
332 */
333int /* error */
334xfs_dir2_block_addname(
335 xfs_da_args_t *args) /* directory op arguments */
336{
337 xfs_dir2_data_hdr_t *hdr; /* block header */
338 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
339 struct xfs_buf *bp; /* buffer for block */
340 xfs_dir2_block_tail_t *btp; /* block tail */
341 int compact; /* need to compact leaf ents */
342 xfs_dir2_data_entry_t *dep; /* block data entry */
343 xfs_inode_t *dp; /* directory inode */
344 xfs_dir2_data_unused_t *dup; /* block unused entry */
345 int error; /* error return value */
346 xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */
347 xfs_dahash_t hash; /* hash value of found entry */
348 int high; /* high index for binary srch */
349 int highstale; /* high stale index */
350 int lfloghigh=0; /* last final leaf to log */
351 int lfloglow=0; /* first final leaf to log */
352 int len; /* length of the new entry */
353 int low; /* low index for binary srch */
354 int lowstale; /* low stale index */
355 int mid=0; /* midpoint for binary srch */
356 xfs_mount_t *mp; /* filesystem mount point */
357 int needlog; /* need to log header */
358 int needscan; /* need to rescan freespace */
359 __be16 *tagp; /* pointer to tag value */
360 xfs_trans_t *tp; /* transaction structure */
361
362 trace_xfs_dir2_block_addname(args);
363
364 dp = args->dp;
365 tp = args->trans;
366 mp = dp->i_mount;
367
368 /* Read the (one and only) directory block into bp. */
369 error = xfs_dir3_block_read(tp, dp, &bp);
370 if (error)
371 return error;
372
373 len = dp->d_ops->data_entsize(args->namelen);
374
375 /*
376 * Set up pointers to parts of the block.
377 */
378 hdr = bp->b_addr;
379 btp = xfs_dir2_block_tail_p(args->geo, hdr);
380 blp = xfs_dir2_block_leaf_p(btp);
381
382 /*
383 * Find out if we can reuse stale entries or whether we need extra
384 * space for entry and new leaf.
385 */
386 xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
387 &enddup, &compact, len);
388
389 /*
390 * Done everything we need for a space check now.
391 */
392 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
393 xfs_trans_brelse(tp, bp);
394 if (!dup)
395 return -ENOSPC;
396 return 0;
397 }
398
399 /*
400 * If we don't have space for the new entry & leaf ...
401 */
402 if (!dup) {
403 /* Don't have a space reservation: return no-space. */
404 if (args->total == 0)
405 return -ENOSPC;
406 /*
407 * Convert to the next larger format.
408 * Then add the new entry in that format.
409 */
410 error = xfs_dir2_block_to_leaf(args, bp);
411 if (error)
412 return error;
413 return xfs_dir2_leaf_addname(args);
414 }
415
416 needlog = needscan = 0;
417
418 /*
419 * If need to compact the leaf entries, do it now.
420 */
421 if (compact) {
422 xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
423 &lfloghigh, &lfloglow);
424 /* recalculate blp post-compaction */
425 blp = xfs_dir2_block_leaf_p(btp);
426 } else if (btp->stale) {
427 /*
428 * Set leaf logging boundaries to impossible state.
429 * For the no-stale case they're set explicitly.
430 */
431 lfloglow = be32_to_cpu(btp->count);
432 lfloghigh = -1;
433 }
434
435 /*
436 * Find the slot that's first lower than our hash value, -1 if none.
437 */
438 for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
439 mid = (low + high) >> 1;
440 if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
441 break;
442 if (hash < args->hashval)
443 low = mid + 1;
444 else
445 high = mid - 1;
446 }
447 while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
448 mid--;
449 }
450 /*
451 * No stale entries, will use enddup space to hold new leaf.
452 */
453 if (!btp->stale) {
454 /*
455 * Mark the space needed for the new leaf entry, now in use.
456 */
457 xfs_dir2_data_use_free(args, bp, enddup,
458 (xfs_dir2_data_aoff_t)
459 ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
460 sizeof(*blp)),
461 (xfs_dir2_data_aoff_t)sizeof(*blp),
462 &needlog, &needscan);
463 /*
464 * Update the tail (entry count).
465 */
466 be32_add_cpu(&btp->count, 1);
467 /*
468 * If we now need to rebuild the bestfree map, do so.
469 * This needs to happen before the next call to use_free.
470 */
471 if (needscan) {
472 xfs_dir2_data_freescan(dp, hdr, &needlog);
473 needscan = 0;
474 }
475 /*
476 * Adjust pointer to the first leaf entry, we're about to move
477 * the table up one to open up space for the new leaf entry.
478 * Then adjust our index to match.
479 */
480 blp--;
481 mid++;
482 if (mid)
483 memmove(blp, &blp[1], mid * sizeof(*blp));
484 lfloglow = 0;
485 lfloghigh = mid;
486 }
487 /*
488 * Use a stale leaf for our new entry.
489 */
490 else {
491 for (lowstale = mid;
492 lowstale >= 0 &&
493 blp[lowstale].address !=
494 cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
495 lowstale--)
496 continue;
497 for (highstale = mid + 1;
498 highstale < be32_to_cpu(btp->count) &&
499 blp[highstale].address !=
500 cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
501 (lowstale < 0 || mid - lowstale > highstale - mid);
502 highstale++)
503 continue;
504 /*
505 * Move entries toward the low-numbered stale entry.
506 */
507 if (lowstale >= 0 &&
508 (highstale == be32_to_cpu(btp->count) ||
509 mid - lowstale <= highstale - mid)) {
510 if (mid - lowstale)
511 memmove(&blp[lowstale], &blp[lowstale + 1],
512 (mid - lowstale) * sizeof(*blp));
513 lfloglow = MIN(lowstale, lfloglow);
514 lfloghigh = MAX(mid, lfloghigh);
515 }
516 /*
517 * Move entries toward the high-numbered stale entry.
518 */
519 else {
520 ASSERT(highstale < be32_to_cpu(btp->count));
521 mid++;
522 if (highstale - mid)
523 memmove(&blp[mid + 1], &blp[mid],
524 (highstale - mid) * sizeof(*blp));
525 lfloglow = MIN(mid, lfloglow);
526 lfloghigh = MAX(highstale, lfloghigh);
527 }
528 be32_add_cpu(&btp->stale, -1);
529 }
530 /*
531 * Point to the new data entry.
532 */
533 dep = (xfs_dir2_data_entry_t *)dup;
534 /*
535 * Fill in the leaf entry.
536 */
537 blp[mid].hashval = cpu_to_be32(args->hashval);
538 blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
539 (char *)dep - (char *)hdr));
540 xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
541 /*
542 * Mark space for the data entry used.
543 */
544 xfs_dir2_data_use_free(args, bp, dup,
545 (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
546 (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
547 /*
548 * Create the new data entry.
549 */
550 dep->inumber = cpu_to_be64(args->inumber);
551 dep->namelen = args->namelen;
552 memcpy(dep->name, args->name, args->namelen);
553 dp->d_ops->data_put_ftype(dep, args->filetype);
554 tagp = dp->d_ops->data_entry_tag_p(dep);
555 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
556 /*
557 * Clean up the bestfree array and log the header, tail, and entry.
558 */
559 if (needscan)
560 xfs_dir2_data_freescan(dp, hdr, &needlog);
561 if (needlog)
562 xfs_dir2_data_log_header(args, bp);
563 xfs_dir2_block_log_tail(tp, bp);
564 xfs_dir2_data_log_entry(args, bp, dep);
565 xfs_dir3_data_check(dp, bp);
566 return 0;
567}
568
569/*
570 * Log leaf entries from the block.
571 */
572static void
573xfs_dir2_block_log_leaf(
574 xfs_trans_t *tp, /* transaction structure */
575 struct xfs_buf *bp, /* block buffer */
576 int first, /* index of first logged leaf */
577 int last) /* index of last logged leaf */
578{
579 xfs_dir2_data_hdr_t *hdr = bp->b_addr;
580 xfs_dir2_leaf_entry_t *blp;
581 xfs_dir2_block_tail_t *btp;
582
583 btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
584 blp = xfs_dir2_block_leaf_p(btp);
585 xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
586 (uint)((char *)&blp[last + 1] - (char *)hdr - 1));
587}
588
589/*
590 * Log the block tail.
591 */
592static void
593xfs_dir2_block_log_tail(
594 xfs_trans_t *tp, /* transaction structure */
595 struct xfs_buf *bp) /* block buffer */
596{
597 xfs_dir2_data_hdr_t *hdr = bp->b_addr;
598 xfs_dir2_block_tail_t *btp;
599
600 btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
601 xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
602 (uint)((char *)(btp + 1) - (char *)hdr - 1));
603}
604
605/*
606 * Look up an entry in the block. This is the external routine,
607 * xfs_dir2_block_lookup_int does the real work.
608 */
609int /* error */
610xfs_dir2_block_lookup(
611 xfs_da_args_t *args) /* dir lookup arguments */
612{
613 xfs_dir2_data_hdr_t *hdr; /* block header */
614 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
615 struct xfs_buf *bp; /* block buffer */
616 xfs_dir2_block_tail_t *btp; /* block tail */
617 xfs_dir2_data_entry_t *dep; /* block data entry */
618 xfs_inode_t *dp; /* incore inode */
619 int ent; /* entry index */
620 int error; /* error return value */
621 xfs_mount_t *mp; /* filesystem mount point */
622
623 trace_xfs_dir2_block_lookup(args);
624
625 /*
626 * Get the buffer, look up the entry.
627 * If not found (ENOENT) then return, have no buffer.
628 */
629 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
630 return error;
631 dp = args->dp;
632 mp = dp->i_mount;
633 hdr = bp->b_addr;
634 xfs_dir3_data_check(dp, bp);
635 btp = xfs_dir2_block_tail_p(args->geo, hdr);
636 blp = xfs_dir2_block_leaf_p(btp);
637 /*
638 * Get the offset from the leaf entry, to point to the data.
639 */
640 dep = (xfs_dir2_data_entry_t *)((char *)hdr +
641 xfs_dir2_dataptr_to_off(args->geo,
642 be32_to_cpu(blp[ent].address)));
643 /*
644 * Fill in inode number, CI name if appropriate, release the block.
645 */
646 args->inumber = be64_to_cpu(dep->inumber);
647 args->filetype = dp->d_ops->data_get_ftype(dep);
648 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
649 xfs_trans_brelse(args->trans, bp);
650 return error;
651}
652
653/*
654 * Internal block lookup routine.
655 */
656static int /* error */
657xfs_dir2_block_lookup_int(
658 xfs_da_args_t *args, /* dir lookup arguments */
659 struct xfs_buf **bpp, /* returned block buffer */
660 int *entno) /* returned entry number */
661{
662 xfs_dir2_dataptr_t addr; /* data entry address */
663 xfs_dir2_data_hdr_t *hdr; /* block header */
664 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
665 struct xfs_buf *bp; /* block buffer */
666 xfs_dir2_block_tail_t *btp; /* block tail */
667 xfs_dir2_data_entry_t *dep; /* block data entry */
668 xfs_inode_t *dp; /* incore inode */
669 int error; /* error return value */
670 xfs_dahash_t hash; /* found hash value */
671 int high; /* binary search high index */
672 int low; /* binary search low index */
673 int mid; /* binary search current idx */
674 xfs_mount_t *mp; /* filesystem mount point */
675 xfs_trans_t *tp; /* transaction pointer */
676 enum xfs_dacmp cmp; /* comparison result */
677
678 dp = args->dp;
679 tp = args->trans;
680 mp = dp->i_mount;
681
682 error = xfs_dir3_block_read(tp, dp, &bp);
683 if (error)
684 return error;
685
686 hdr = bp->b_addr;
687 xfs_dir3_data_check(dp, bp);
688 btp = xfs_dir2_block_tail_p(args->geo, hdr);
689 blp = xfs_dir2_block_leaf_p(btp);
690 /*
691 * Loop doing a binary search for our hash value.
692 * Find our entry, ENOENT if it's not there.
693 */
694 for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
695 ASSERT(low <= high);
696 mid = (low + high) >> 1;
697 if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
698 break;
699 if (hash < args->hashval)
700 low = mid + 1;
701 else
702 high = mid - 1;
703 if (low > high) {
704 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
705 xfs_trans_brelse(tp, bp);
706 return -ENOENT;
707 }
708 }
709 /*
710 * Back up to the first one with the right hash value.
711 */
712 while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
713 mid--;
714 }
715 /*
716 * Now loop forward through all the entries with the
717 * right hash value looking for our name.
718 */
719 do {
720 if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
721 continue;
722 /*
723 * Get pointer to the entry from the leaf.
724 */
725 dep = (xfs_dir2_data_entry_t *)
726 ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
727 /*
728 * Compare name and if it's an exact match, return the index
729 * and buffer. If it's the first case-insensitive match, store
730 * the index and buffer and continue looking for an exact match.
731 */
732 cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
733 if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
734 args->cmpresult = cmp;
735 *bpp = bp;
736 *entno = mid;
737 if (cmp == XFS_CMP_EXACT)
738 return 0;
739 }
740 } while (++mid < be32_to_cpu(btp->count) &&
741 be32_to_cpu(blp[mid].hashval) == hash);
742
743 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
744 /*
745 * Here, we can only be doing a lookup (not a rename or replace).
746 * If a case-insensitive match was found earlier, return success.
747 */
748 if (args->cmpresult == XFS_CMP_CASE)
749 return 0;
750 /*
751 * No match, release the buffer and return ENOENT.
752 */
753 xfs_trans_brelse(tp, bp);
754 return -ENOENT;
755}
756
757/*
758 * Remove an entry from a block format directory.
759 * If that makes the block small enough to fit in shortform, transform it.
760 */
761int /* error */
762xfs_dir2_block_removename(
763 xfs_da_args_t *args) /* directory operation args */
764{
765 xfs_dir2_data_hdr_t *hdr; /* block header */
766 xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
767 struct xfs_buf *bp; /* block buffer */
768 xfs_dir2_block_tail_t *btp; /* block tail */
769 xfs_dir2_data_entry_t *dep; /* block data entry */
770 xfs_inode_t *dp; /* incore inode */
771 int ent; /* block leaf entry index */
772 int error; /* error return value */
773 xfs_mount_t *mp; /* filesystem mount point */
774 int needlog; /* need to log block header */
775 int needscan; /* need to fixup bestfree */
776 xfs_dir2_sf_hdr_t sfh; /* shortform header */
777 int size; /* shortform size */
778 xfs_trans_t *tp; /* transaction pointer */
779
780 trace_xfs_dir2_block_removename(args);
781
782 /*
783 * Look up the entry in the block. Gets the buffer and entry index.
784 * It will always be there, the vnodeops level does a lookup first.
785 */
786 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
787 return error;
788 }
789 dp = args->dp;
790 tp = args->trans;
791 mp = dp->i_mount;
792 hdr = bp->b_addr;
793 btp = xfs_dir2_block_tail_p(args->geo, hdr);
794 blp = xfs_dir2_block_leaf_p(btp);
795 /*
796 * Point to the data entry using the leaf entry.
797 */
798 dep = (xfs_dir2_data_entry_t *)((char *)hdr +
799 xfs_dir2_dataptr_to_off(args->geo,
800 be32_to_cpu(blp[ent].address)));
801 /*
802 * Mark the data entry's space free.
803 */
804 needlog = needscan = 0;
805 xfs_dir2_data_make_free(args, bp,
806 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
807 dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
808 /*
809 * Fix up the block tail.
810 */
811 be32_add_cpu(&btp->stale, 1);
812 xfs_dir2_block_log_tail(tp, bp);
813 /*
814 * Remove the leaf entry by marking it stale.
815 */
816 blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
817 xfs_dir2_block_log_leaf(tp, bp, ent, ent);
818 /*
819 * Fix up bestfree, log the header if necessary.
820 */
821 if (needscan)
822 xfs_dir2_data_freescan(dp, hdr, &needlog);
823 if (needlog)
824 xfs_dir2_data_log_header(args, bp);
825 xfs_dir3_data_check(dp, bp);
826 /*
827 * See if the size as a shortform is good enough.
828 */
829 size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
830 if (size > XFS_IFORK_DSIZE(dp))
831 return 0;
832
833 /*
834 * If it works, do the conversion.
835 */
836 return xfs_dir2_block_to_sf(args, bp, size, &sfh);
837}
838
839/*
840 * Replace an entry in a V2 block directory.
841 * Change the inode number to the new value.
842 */
843int /* error */
844xfs_dir2_block_replace(
845 xfs_da_args_t *args) /* directory operation args */
846{
847 xfs_dir2_data_hdr_t *hdr; /* block header */
848 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
849 struct xfs_buf *bp; /* block buffer */
850 xfs_dir2_block_tail_t *btp; /* block tail */
851 xfs_dir2_data_entry_t *dep; /* block data entry */
852 xfs_inode_t *dp; /* incore inode */
853 int ent; /* leaf entry index */
854 int error; /* error return value */
855 xfs_mount_t *mp; /* filesystem mount point */
856
857 trace_xfs_dir2_block_replace(args);
858
859 /*
860 * Lookup the entry in the directory. Get buffer and entry index.
861 * This will always succeed since the caller has already done a lookup.
862 */
863 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
864 return error;
865 }
866 dp = args->dp;
867 mp = dp->i_mount;
868 hdr = bp->b_addr;
869 btp = xfs_dir2_block_tail_p(args->geo, hdr);
870 blp = xfs_dir2_block_leaf_p(btp);
871 /*
872 * Point to the data entry we need to change.
873 */
874 dep = (xfs_dir2_data_entry_t *)((char *)hdr +
875 xfs_dir2_dataptr_to_off(args->geo,
876 be32_to_cpu(blp[ent].address)));
877 ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
878 /*
879 * Change the inode number to the new value.
880 */
881 dep->inumber = cpu_to_be64(args->inumber);
882 dp->d_ops->data_put_ftype(dep, args->filetype);
883 xfs_dir2_data_log_entry(args, bp, dep);
884 xfs_dir3_data_check(dp, bp);
885 return 0;
886}
887
888/*
889 * Qsort comparison routine for the block leaf entries.
890 */
891static int /* sort order */
892xfs_dir2_block_sort(
893 const void *a, /* first leaf entry */
894 const void *b) /* second leaf entry */
895{
896 const xfs_dir2_leaf_entry_t *la; /* first leaf entry */
897 const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */
898
899 la = a;
900 lb = b;
901 return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
902 (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
903}
904
905/*
906 * Convert a V2 leaf directory to a V2 block directory if possible.
907 */
908int /* error */
909xfs_dir2_leaf_to_block(
910 xfs_da_args_t *args, /* operation arguments */
911 struct xfs_buf *lbp, /* leaf buffer */
912 struct xfs_buf *dbp) /* data buffer */
913{
914 __be16 *bestsp; /* leaf bests table */
915 xfs_dir2_data_hdr_t *hdr; /* block header */
916 xfs_dir2_block_tail_t *btp; /* block tail */
917 xfs_inode_t *dp; /* incore directory inode */
918 xfs_dir2_data_unused_t *dup; /* unused data entry */
919 int error; /* error return value */
920 int from; /* leaf from index */
921 xfs_dir2_leaf_t *leaf; /* leaf structure */
922 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
923 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
924 xfs_mount_t *mp; /* file system mount point */
925 int needlog; /* need to log data header */
926 int needscan; /* need to scan for bestfree */
927 xfs_dir2_sf_hdr_t sfh; /* shortform header */
928 int size; /* bytes used */
929 __be16 *tagp; /* end of entry (tag) */
930 int to; /* block/leaf to index */
931 xfs_trans_t *tp; /* transaction pointer */
932 struct xfs_dir2_leaf_entry *ents;
933 struct xfs_dir3_icleaf_hdr leafhdr;
934
935 trace_xfs_dir2_leaf_to_block(args);
936
937 dp = args->dp;
938 tp = args->trans;
939 mp = dp->i_mount;
940 leaf = lbp->b_addr;
941 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
942 ents = dp->d_ops->leaf_ents_p(leaf);
943 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
944
945 ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
946 leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
947 /*
948 * If there are data blocks other than the first one, take this
949 * opportunity to remove trailing empty data blocks that may have
950 * been left behind during no-space-reservation operations.
951 * These will show up in the leaf bests table.
952 */
953 while (dp->i_d.di_size > args->geo->blksize) {
954 int hdrsz;
955
956 hdrsz = dp->d_ops->data_entry_offset;
957 bestsp = xfs_dir2_leaf_bests_p(ltp);
958 if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
959 args->geo->blksize - hdrsz) {
960 if ((error =
961 xfs_dir2_leaf_trim_data(args, lbp,
962 (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
963 return error;
964 } else
965 return 0;
966 }
967 /*
968 * Read the data block if we don't already have it, give up if it fails.
969 */
970 if (!dbp) {
971 error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
972 if (error)
973 return error;
974 }
975 hdr = dbp->b_addr;
976 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
977 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
978
979 /*
980 * Size of the "leaf" area in the block.
981 */
982 size = (uint)sizeof(xfs_dir2_block_tail_t) +
983 (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
984 /*
985 * Look at the last data entry.
986 */
987 tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
988 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
989 /*
990 * If it's not free or is too short we can't do it.
991 */
992 if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
993 be16_to_cpu(dup->length) < size)
994 return 0;
995
996 /*
997 * Start converting it to block form.
998 */
999 xfs_dir3_block_init(mp, tp, dbp, dp);
1000
1001 needlog = 1;
1002 needscan = 0;
1003 /*
1004 * Use up the space at the end of the block (blp/btp).
1005 */
1006 xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
1007 &needlog, &needscan);
1008 /*
1009 * Initialize the block tail.
1010 */
1011 btp = xfs_dir2_block_tail_p(args->geo, hdr);
1012 btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
1013 btp->stale = 0;
1014 xfs_dir2_block_log_tail(tp, dbp);
1015 /*
1016 * Initialize the block leaf area. We compact out stale entries.
1017 */
1018 lep = xfs_dir2_block_leaf_p(btp);
1019 for (from = to = 0; from < leafhdr.count; from++) {
1020 if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
1021 continue;
1022 lep[to++] = ents[from];
1023 }
1024 ASSERT(to == be32_to_cpu(btp->count));
1025 xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
1026 /*
1027 * Scan the bestfree if we need it and log the data block header.
1028 */
1029 if (needscan)
1030 xfs_dir2_data_freescan(dp, hdr, &needlog);
1031 if (needlog)
1032 xfs_dir2_data_log_header(args, dbp);
1033 /*
1034 * Pitch the old leaf block.
1035 */
1036 error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
1037 if (error)
1038 return error;
1039
1040 /*
1041 * Now see if the resulting block can be shrunken to shortform.
1042 */
1043 size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
1044 if (size > XFS_IFORK_DSIZE(dp))
1045 return 0;
1046
1047 return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
1048}
1049
1050/*
1051 * Convert the shortform directory to block form.
1052 */
1053int /* error */
1054xfs_dir2_sf_to_block(
1055 xfs_da_args_t *args) /* operation arguments */
1056{
1057 xfs_dir2_db_t blkno; /* dir-relative block # (0) */
1058 xfs_dir2_data_hdr_t *hdr; /* block header */
1059 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
1060 struct xfs_buf *bp; /* block buffer */
1061 xfs_dir2_block_tail_t *btp; /* block tail pointer */
1062 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1063 xfs_inode_t *dp; /* incore directory inode */
1064 int dummy; /* trash */
1065 xfs_dir2_data_unused_t *dup; /* unused entry pointer */
1066 int endoffset; /* end of data objects */
1067 int error; /* error return value */
1068 int i; /* index */
1069 xfs_mount_t *mp; /* filesystem mount point */
1070 int needlog; /* need to log block header */
1071 int needscan; /* need to scan block freespc */
1072 int newoffset; /* offset from current entry */
1073 int offset; /* target block offset */
1074 xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
1075 xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
1076 xfs_dir2_sf_hdr_t *sfp; /* shortform header */
1077 __be16 *tagp; /* end of data entry */
1078 xfs_trans_t *tp; /* transaction pointer */
1079 struct xfs_name name;
1080 struct xfs_ifork *ifp;
1081
1082 trace_xfs_dir2_sf_to_block(args);
1083
1084 dp = args->dp;
1085 tp = args->trans;
1086 mp = dp->i_mount;
1087 ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
1088 ASSERT(ifp->if_flags & XFS_IFINLINE);
1089 /*
1090 * Bomb out if the shortform directory is way too short.
1091 */
1092 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
1093 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1094 return -EIO;
1095 }
1096
1097 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
1098
1099 ASSERT(ifp->if_bytes == dp->i_d.di_size);
1100 ASSERT(ifp->if_u1.if_data != NULL);
1101 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
1102 ASSERT(dp->i_d.di_nextents == 0);
1103
1104 /*
1105 * Copy the directory into a temporary buffer.
1106 * Then pitch the incore inode data so we can make extents.
1107 */
1108 sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
1109 memcpy(sfp, oldsfp, ifp->if_bytes);
1110
1111 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
1112 xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
1113 dp->i_d.di_size = 0;
1114
1115 /*
1116 * Add block 0 to the inode.
1117 */
1118 error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
1119 if (error) {
1120 kmem_free(sfp);
1121 return error;
1122 }
1123 /*
1124 * Initialize the data block, then convert it to block format.
1125 */
1126 error = xfs_dir3_data_init(args, blkno, &bp);
1127 if (error) {
1128 kmem_free(sfp);
1129 return error;
1130 }
1131 xfs_dir3_block_init(mp, tp, bp, dp);
1132 hdr = bp->b_addr;
1133
1134 /*
1135 * Compute size of block "tail" area.
1136 */
1137 i = (uint)sizeof(*btp) +
1138 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
1139 /*
1140 * The whole thing is initialized to free by the init routine.
1141 * Say we're using the leaf and tail area.
1142 */
1143 dup = dp->d_ops->data_unused_p(hdr);
1144 needlog = needscan = 0;
1145 xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
1146 i, &needlog, &needscan);
1147 ASSERT(needscan == 0);
1148 /*
1149 * Fill in the tail.
1150 */
1151 btp = xfs_dir2_block_tail_p(args->geo, hdr);
1152 btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */
1153 btp->stale = 0;
1154 blp = xfs_dir2_block_leaf_p(btp);
1155 endoffset = (uint)((char *)blp - (char *)hdr);
1156 /*
1157 * Remove the freespace, we'll manage it.
1158 */
1159 xfs_dir2_data_use_free(args, bp, dup,
1160 (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
1161 be16_to_cpu(dup->length), &needlog, &needscan);
1162 /*
1163 * Create entry for .
1164 */
1165 dep = dp->d_ops->data_dot_entry_p(hdr);
1166 dep->inumber = cpu_to_be64(dp->i_ino);
1167 dep->namelen = 1;
1168 dep->name[0] = '.';
1169 dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
1170 tagp = dp->d_ops->data_entry_tag_p(dep);
1171 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1172 xfs_dir2_data_log_entry(args, bp, dep);
1173 blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
1174 blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
1175 (char *)dep - (char *)hdr));
1176 /*
1177 * Create entry for ..
1178 */
1179 dep = dp->d_ops->data_dotdot_entry_p(hdr);
1180 dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
1181 dep->namelen = 2;
1182 dep->name[0] = dep->name[1] = '.';
1183 dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
1184 tagp = dp->d_ops->data_entry_tag_p(dep);
1185 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1186 xfs_dir2_data_log_entry(args, bp, dep);
1187 blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
1188 blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
1189 (char *)dep - (char *)hdr));
1190 offset = dp->d_ops->data_first_offset;
1191 /*
1192 * Loop over existing entries, stuff them in.
1193 */
1194 i = 0;
1195 if (!sfp->count)
1196 sfep = NULL;
1197 else
1198 sfep = xfs_dir2_sf_firstentry(sfp);
1199 /*
1200 * Need to preserve the existing offset values in the sf directory.
1201 * Insert holes (unused entries) where necessary.
1202 */
1203 while (offset < endoffset) {
1204 /*
1205 * sfep is null when we reach the end of the list.
1206 */
1207 if (sfep == NULL)
1208 newoffset = endoffset;
1209 else
1210 newoffset = xfs_dir2_sf_get_offset(sfep);
1211 /*
1212 * There should be a hole here, make one.
1213 */
1214 if (offset < newoffset) {
1215 dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
1216 dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
1217 dup->length = cpu_to_be16(newoffset - offset);
1218 *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
1219 ((char *)dup - (char *)hdr));
1220 xfs_dir2_data_log_unused(args, bp, dup);
1221 xfs_dir2_data_freeinsert(hdr,
1222 dp->d_ops->data_bestfree_p(hdr),
1223 dup, &dummy);
1224 offset += be16_to_cpu(dup->length);
1225 continue;
1226 }
1227 /*
1228 * Copy a real entry.
1229 */
1230 dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
1231 dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
1232 dep->namelen = sfep->namelen;
1233 dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
1234 memcpy(dep->name, sfep->name, dep->namelen);
1235 tagp = dp->d_ops->data_entry_tag_p(dep);
1236 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1237 xfs_dir2_data_log_entry(args, bp, dep);
1238 name.name = sfep->name;
1239 name.len = sfep->namelen;
1240 blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
1241 hashname(&name));
1242 blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
1243 (char *)dep - (char *)hdr));
1244 offset = (int)((char *)(tagp + 1) - (char *)hdr);
1245 if (++i == sfp->count)
1246 sfep = NULL;
1247 else
1248 sfep = dp->d_ops->sf_nextentry(sfp, sfep);
1249 }
1250 /* Done with the temporary buffer */
1251 kmem_free(sfp);
1252 /*
1253 * Sort the leaf entries by hash value.
1254 */
1255 xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
1256 /*
1257 * Log the leaf entry area and tail.
1258 * Already logged the header in data_init, ignore needlog.
1259 */
1260 ASSERT(needscan == 0);
1261 xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
1262 xfs_dir2_block_log_tail(tp, bp);
1263 xfs_dir3_data_check(dp, bp);
1264 return 0;
1265}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644
index 000000000000..fdd803fecb8e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -0,0 +1,1050 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
29#include "xfs_inode.h"
30#include "xfs_dir2.h"
31#include "xfs_dir2_priv.h"
32#include "xfs_error.h"
33#include "xfs_trans.h"
34#include "xfs_buf_item.h"
35#include "xfs_cksum.h"
36
37/*
38 * Check the consistency of the data block.
39 * The input can also be a block-format directory.
40 * Return 0 is the buffer is good, otherwise an error.
41 */
42int
43__xfs_dir3_data_check(
44 struct xfs_inode *dp, /* incore inode pointer */
45 struct xfs_buf *bp) /* data block's buffer */
46{
47 xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
48 xfs_dir2_data_free_t *bf; /* bestfree table */
49 xfs_dir2_block_tail_t *btp=NULL; /* block tail */
50 int count; /* count of entries found */
51 xfs_dir2_data_hdr_t *hdr; /* data block header */
52 xfs_dir2_data_entry_t *dep; /* data entry */
53 xfs_dir2_data_free_t *dfp; /* bestfree entry */
54 xfs_dir2_data_unused_t *dup; /* unused entry */
55 char *endp; /* end of useful data */
56 int freeseen; /* mask of bestfrees seen */
57 xfs_dahash_t hash; /* hash of current name */
58 int i; /* leaf index */
59 int lastfree; /* last entry was unused */
60 xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
61 xfs_mount_t *mp; /* filesystem mount point */
62 char *p; /* current data position */
63 int stale; /* count of stale leaves */
64 struct xfs_name name;
65 const struct xfs_dir_ops *ops;
66 struct xfs_da_geometry *geo;
67
68 mp = bp->b_target->bt_mount;
69 geo = mp->m_dir_geo;
70
71 /*
72 * We can be passed a null dp here from a verifier, so we need to go the
73 * hard way to get them.
74 */
75 ops = xfs_dir_get_ops(mp, dp);
76
77 hdr = bp->b_addr;
78 p = (char *)ops->data_entry_p(hdr);
79
80 switch (hdr->magic) {
81 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
82 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
83 btp = xfs_dir2_block_tail_p(geo, hdr);
84 lep = xfs_dir2_block_leaf_p(btp);
85 endp = (char *)lep;
86
87 /*
88 * The number of leaf entries is limited by the size of the
89 * block and the amount of space used by the data entries.
90 * We don't know how much space is used by the data entries yet,
91 * so just ensure that the count falls somewhere inside the
92 * block right now.
93 */
94 XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
95 ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
96 break;
97 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
98 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
99 endp = (char *)hdr + geo->blksize;
100 break;
101 default:
102 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
103 return -EFSCORRUPTED;
104 }
105
106 /*
107 * Account for zero bestfree entries.
108 */
109 bf = ops->data_bestfree_p(hdr);
110 count = lastfree = freeseen = 0;
111 if (!bf[0].length) {
112 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
113 freeseen |= 1 << 0;
114 }
115 if (!bf[1].length) {
116 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
117 freeseen |= 1 << 1;
118 }
119 if (!bf[2].length) {
120 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
121 freeseen |= 1 << 2;
122 }
123
124 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
125 be16_to_cpu(bf[1].length));
126 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
127 be16_to_cpu(bf[2].length));
128 /*
129 * Loop over the data/unused entries.
130 */
131 while (p < endp) {
132 dup = (xfs_dir2_data_unused_t *)p;
133 /*
134 * If it's unused, look for the space in the bestfree table.
135 * If we find it, account for that, else make sure it
136 * doesn't need to be there.
137 */
138 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
139 XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
140 XFS_WANT_CORRUPTED_RETURN(
141 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
142 (char *)dup - (char *)hdr);
143 dfp = xfs_dir2_data_freefind(hdr, bf, dup);
144 if (dfp) {
145 i = (int)(dfp - bf);
146 XFS_WANT_CORRUPTED_RETURN(
147 (freeseen & (1 << i)) == 0);
148 freeseen |= 1 << i;
149 } else {
150 XFS_WANT_CORRUPTED_RETURN(
151 be16_to_cpu(dup->length) <=
152 be16_to_cpu(bf[2].length));
153 }
154 p += be16_to_cpu(dup->length);
155 lastfree = 1;
156 continue;
157 }
158 /*
159 * It's a real entry. Validate the fields.
160 * If this is a block directory then make sure it's
161 * in the leaf section of the block.
162 * The linear search is crude but this is DEBUG code.
163 */
164 dep = (xfs_dir2_data_entry_t *)p;
165 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
166 XFS_WANT_CORRUPTED_RETURN(
167 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
168 XFS_WANT_CORRUPTED_RETURN(
169 be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
170 (char *)dep - (char *)hdr);
171 XFS_WANT_CORRUPTED_RETURN(
172 ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
173 count++;
174 lastfree = 0;
175 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
176 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
177 addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
178 (xfs_dir2_data_aoff_t)
179 ((char *)dep - (char *)hdr));
180 name.name = dep->name;
181 name.len = dep->namelen;
182 hash = mp->m_dirnameops->hashname(&name);
183 for (i = 0; i < be32_to_cpu(btp->count); i++) {
184 if (be32_to_cpu(lep[i].address) == addr &&
185 be32_to_cpu(lep[i].hashval) == hash)
186 break;
187 }
188 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
189 }
190 p += ops->data_entsize(dep->namelen);
191 }
192 /*
193 * Need to have seen all the entries and all the bestfree slots.
194 */
195 XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
196 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
197 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
198 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
199 if (lep[i].address ==
200 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
201 stale++;
202 if (i > 0)
203 XFS_WANT_CORRUPTED_RETURN(
204 be32_to_cpu(lep[i].hashval) >=
205 be32_to_cpu(lep[i - 1].hashval));
206 }
207 XFS_WANT_CORRUPTED_RETURN(count ==
208 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
209 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
210 }
211 return 0;
212}
213
214static bool
215xfs_dir3_data_verify(
216 struct xfs_buf *bp)
217{
218 struct xfs_mount *mp = bp->b_target->bt_mount;
219 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
220
221 if (xfs_sb_version_hascrc(&mp->m_sb)) {
222 if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
223 return false;
224 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
225 return false;
226 if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
227 return false;
228 } else {
229 if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
230 return false;
231 }
232 if (__xfs_dir3_data_check(NULL, bp))
233 return false;
234 return true;
235}
236
237/*
238 * Readahead of the first block of the directory when it is opened is completely
239 * oblivious to the format of the directory. Hence we can either get a block
240 * format buffer or a data format buffer on readahead.
241 */
242static void
243xfs_dir3_data_reada_verify(
244 struct xfs_buf *bp)
245{
246 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
247
248 switch (hdr->magic) {
249 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
250 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
251 bp->b_ops = &xfs_dir3_block_buf_ops;
252 bp->b_ops->verify_read(bp);
253 return;
254 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
255 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
256 xfs_dir3_data_verify(bp);
257 return;
258 default:
259 xfs_buf_ioerror(bp, -EFSCORRUPTED);
260 xfs_verifier_error(bp);
261 break;
262 }
263}
264
265static void
266xfs_dir3_data_read_verify(
267 struct xfs_buf *bp)
268{
269 struct xfs_mount *mp = bp->b_target->bt_mount;
270
271 if (xfs_sb_version_hascrc(&mp->m_sb) &&
272 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
273 xfs_buf_ioerror(bp, -EFSBADCRC);
274 else if (!xfs_dir3_data_verify(bp))
275 xfs_buf_ioerror(bp, -EFSCORRUPTED);
276
277 if (bp->b_error)
278 xfs_verifier_error(bp);
279}
280
281static void
282xfs_dir3_data_write_verify(
283 struct xfs_buf *bp)
284{
285 struct xfs_mount *mp = bp->b_target->bt_mount;
286 struct xfs_buf_log_item *bip = bp->b_fspriv;
287 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
288
289 if (!xfs_dir3_data_verify(bp)) {
290 xfs_buf_ioerror(bp, -EFSCORRUPTED);
291 xfs_verifier_error(bp);
292 return;
293 }
294
295 if (!xfs_sb_version_hascrc(&mp->m_sb))
296 return;
297
298 if (bip)
299 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
300
301 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
302}
303
304const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
305 .verify_read = xfs_dir3_data_read_verify,
306 .verify_write = xfs_dir3_data_write_verify,
307};
308
309static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
310 .verify_read = xfs_dir3_data_reada_verify,
311 .verify_write = xfs_dir3_data_write_verify,
312};
313
314
315int
316xfs_dir3_data_read(
317 struct xfs_trans *tp,
318 struct xfs_inode *dp,
319 xfs_dablk_t bno,
320 xfs_daddr_t mapped_bno,
321 struct xfs_buf **bpp)
322{
323 int err;
324
325 err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
326 XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
327 if (!err && tp)
328 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
329 return err;
330}
331
332int
333xfs_dir3_data_readahead(
334 struct xfs_inode *dp,
335 xfs_dablk_t bno,
336 xfs_daddr_t mapped_bno)
337{
338 return xfs_da_reada_buf(dp, bno, mapped_bno,
339 XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
340}
341
342/*
343 * Given a data block and an unused entry from that block,
344 * return the bestfree entry if any that corresponds to it.
345 */
346xfs_dir2_data_free_t *
347xfs_dir2_data_freefind(
348 struct xfs_dir2_data_hdr *hdr, /* data block header */
349 struct xfs_dir2_data_free *bf, /* bestfree table pointer */
350 struct xfs_dir2_data_unused *dup) /* unused space */
351{
352 xfs_dir2_data_free_t *dfp; /* bestfree entry */
353 xfs_dir2_data_aoff_t off; /* offset value needed */
354#ifdef DEBUG
355 int matched; /* matched the value */
356 int seenzero; /* saw a 0 bestfree entry */
357#endif
358
359 off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
360
361#ifdef DEBUG
362 /*
363 * Validate some consistency in the bestfree table.
364 * Check order, non-overlapping entries, and if we find the
365 * one we're looking for it has to be exact.
366 */
367 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
368 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
369 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
370 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
371 for (dfp = &bf[0], seenzero = matched = 0;
372 dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
373 dfp++) {
374 if (!dfp->offset) {
375 ASSERT(!dfp->length);
376 seenzero = 1;
377 continue;
378 }
379 ASSERT(seenzero == 0);
380 if (be16_to_cpu(dfp->offset) == off) {
381 matched = 1;
382 ASSERT(dfp->length == dup->length);
383 } else if (off < be16_to_cpu(dfp->offset))
384 ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
385 else
386 ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
387 ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
388 if (dfp > &bf[0])
389 ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
390 }
391#endif
392 /*
393 * If this is smaller than the smallest bestfree entry,
394 * it can't be there since they're sorted.
395 */
396 if (be16_to_cpu(dup->length) <
397 be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
398 return NULL;
399 /*
400 * Look at the three bestfree entries for our guy.
401 */
402 for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
403 if (!dfp->offset)
404 return NULL;
405 if (be16_to_cpu(dfp->offset) == off)
406 return dfp;
407 }
408 /*
409 * Didn't find it. This only happens if there are duplicate lengths.
410 */
411 return NULL;
412}
413
414/*
415 * Insert an unused-space entry into the bestfree table.
416 */
417xfs_dir2_data_free_t * /* entry inserted */
418xfs_dir2_data_freeinsert(
419 struct xfs_dir2_data_hdr *hdr, /* data block pointer */
420 struct xfs_dir2_data_free *dfp, /* bestfree table pointer */
421 struct xfs_dir2_data_unused *dup, /* unused space */
422 int *loghead) /* log the data header (out) */
423{
424 xfs_dir2_data_free_t new; /* new bestfree entry */
425
426 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
427 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
428 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
429 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
430
431 new.length = dup->length;
432 new.offset = cpu_to_be16((char *)dup - (char *)hdr);
433
434 /*
435 * Insert at position 0, 1, or 2; or not at all.
436 */
437 if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
438 dfp[2] = dfp[1];
439 dfp[1] = dfp[0];
440 dfp[0] = new;
441 *loghead = 1;
442 return &dfp[0];
443 }
444 if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
445 dfp[2] = dfp[1];
446 dfp[1] = new;
447 *loghead = 1;
448 return &dfp[1];
449 }
450 if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
451 dfp[2] = new;
452 *loghead = 1;
453 return &dfp[2];
454 }
455 return NULL;
456}
457
458/*
459 * Remove a bestfree entry from the table.
460 */
461STATIC void
462xfs_dir2_data_freeremove(
463 struct xfs_dir2_data_hdr *hdr, /* data block header */
464 struct xfs_dir2_data_free *bf, /* bestfree table pointer */
465 struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */
466 int *loghead) /* out: log data header */
467{
468
469 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
470 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
471 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
472 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
473
474 /*
475 * It's the first entry, slide the next 2 up.
476 */
477 if (dfp == &bf[0]) {
478 bf[0] = bf[1];
479 bf[1] = bf[2];
480 }
481 /*
482 * It's the second entry, slide the 3rd entry up.
483 */
484 else if (dfp == &bf[1])
485 bf[1] = bf[2];
486 /*
487 * Must be the last entry.
488 */
489 else
490 ASSERT(dfp == &bf[2]);
491 /*
492 * Clear the 3rd entry, must be zero now.
493 */
494 bf[2].length = 0;
495 bf[2].offset = 0;
496 *loghead = 1;
497}
498
499/*
500 * Given a data block, reconstruct its bestfree map.
501 */
502void
503xfs_dir2_data_freescan(
504 struct xfs_inode *dp,
505 struct xfs_dir2_data_hdr *hdr,
506 int *loghead)
507{
508 xfs_dir2_block_tail_t *btp; /* block tail */
509 xfs_dir2_data_entry_t *dep; /* active data entry */
510 xfs_dir2_data_unused_t *dup; /* unused data entry */
511 struct xfs_dir2_data_free *bf;
512 char *endp; /* end of block's data */
513 char *p; /* current entry pointer */
514 struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo;
515
516 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
517 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
518 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
519 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
520
521 /*
522 * Start by clearing the table.
523 */
524 bf = dp->d_ops->data_bestfree_p(hdr);
525 memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
526 *loghead = 1;
527 /*
528 * Set up pointers.
529 */
530 p = (char *)dp->d_ops->data_entry_p(hdr);
531 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
532 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
533 btp = xfs_dir2_block_tail_p(geo, hdr);
534 endp = (char *)xfs_dir2_block_leaf_p(btp);
535 } else
536 endp = (char *)hdr + geo->blksize;
537 /*
538 * Loop over the block's entries.
539 */
540 while (p < endp) {
541 dup = (xfs_dir2_data_unused_t *)p;
542 /*
543 * If it's a free entry, insert it.
544 */
545 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
546 ASSERT((char *)dup - (char *)hdr ==
547 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
548 xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
549 p += be16_to_cpu(dup->length);
550 }
551 /*
552 * For active entries, check their tags and skip them.
553 */
554 else {
555 dep = (xfs_dir2_data_entry_t *)p;
556 ASSERT((char *)dep - (char *)hdr ==
557 be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
558 p += dp->d_ops->data_entsize(dep->namelen);
559 }
560 }
561}
562
563/*
564 * Initialize a data block at the given block number in the directory.
565 * Give back the buffer for the created block.
566 */
567int /* error */
568xfs_dir3_data_init(
569 xfs_da_args_t *args, /* directory operation args */
570 xfs_dir2_db_t blkno, /* logical dir block number */
571 struct xfs_buf **bpp) /* output block buffer */
572{
573 struct xfs_buf *bp; /* block buffer */
574 xfs_dir2_data_hdr_t *hdr; /* data block header */
575 xfs_inode_t *dp; /* incore directory inode */
576 xfs_dir2_data_unused_t *dup; /* unused entry pointer */
577 struct xfs_dir2_data_free *bf;
578 int error; /* error return value */
579 int i; /* bestfree index */
580 xfs_mount_t *mp; /* filesystem mount point */
581 xfs_trans_t *tp; /* transaction pointer */
582 int t; /* temp */
583
584 dp = args->dp;
585 mp = dp->i_mount;
586 tp = args->trans;
587 /*
588 * Get the buffer set up for the block.
589 */
590 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
591 -1, &bp, XFS_DATA_FORK);
592 if (error)
593 return error;
594 bp->b_ops = &xfs_dir3_data_buf_ops;
595 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
596
597 /*
598 * Initialize the header.
599 */
600 hdr = bp->b_addr;
601 if (xfs_sb_version_hascrc(&mp->m_sb)) {
602 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
603
604 memset(hdr3, 0, sizeof(*hdr3));
605 hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
606 hdr3->blkno = cpu_to_be64(bp->b_bn);
607 hdr3->owner = cpu_to_be64(dp->i_ino);
608 uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
609
610 } else
611 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
612
613 bf = dp->d_ops->data_bestfree_p(hdr);
614 bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
615 for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
616 bf[i].length = 0;
617 bf[i].offset = 0;
618 }
619
620 /*
621 * Set up an unused entry for the block's body.
622 */
623 dup = dp->d_ops->data_unused_p(hdr);
624 dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
625
626 t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
627 bf[0].length = cpu_to_be16(t);
628 dup->length = cpu_to_be16(t);
629 *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
630 /*
631 * Log it and return it.
632 */
633 xfs_dir2_data_log_header(args, bp);
634 xfs_dir2_data_log_unused(args, bp, dup);
635 *bpp = bp;
636 return 0;
637}
638
639/*
640 * Log an active data entry from the block.
641 */
642void
643xfs_dir2_data_log_entry(
644 struct xfs_da_args *args,
645 struct xfs_buf *bp,
646 xfs_dir2_data_entry_t *dep) /* data entry pointer */
647{
648 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
649
650 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
651 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
652 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
653 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
654
655 xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
656 (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
657 (char *)hdr - 1));
658}
659
660/*
661 * Log a data block header.
662 */
663void
664xfs_dir2_data_log_header(
665 struct xfs_da_args *args,
666 struct xfs_buf *bp)
667{
668#ifdef DEBUG
669 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
670
671 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
672 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
673 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
674 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
675#endif
676
677 xfs_trans_log_buf(args->trans, bp, 0,
678 args->dp->d_ops->data_entry_offset - 1);
679}
680
681/*
682 * Log a data unused entry.
683 */
684void
685xfs_dir2_data_log_unused(
686 struct xfs_da_args *args,
687 struct xfs_buf *bp,
688 xfs_dir2_data_unused_t *dup) /* data unused pointer */
689{
690 xfs_dir2_data_hdr_t *hdr = bp->b_addr;
691
692 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
693 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
694 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
695 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
696
697 /*
698 * Log the first part of the unused entry.
699 */
700 xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
701 (uint)((char *)&dup->length + sizeof(dup->length) -
702 1 - (char *)hdr));
703 /*
704 * Log the end (tag) of the unused entry.
705 */
706 xfs_trans_log_buf(args->trans, bp,
707 (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
708 (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
709 sizeof(xfs_dir2_data_off_t) - 1));
710}
711
712/*
713 * Make a byte range in the data block unused.
714 * Its current contents are unimportant.
715 */
716void
717xfs_dir2_data_make_free(
718 struct xfs_da_args *args,
719 struct xfs_buf *bp,
720 xfs_dir2_data_aoff_t offset, /* starting byte offset */
721 xfs_dir2_data_aoff_t len, /* length in bytes */
722 int *needlogp, /* out: log header */
723 int *needscanp) /* out: regen bestfree */
724{
725 xfs_dir2_data_hdr_t *hdr; /* data block pointer */
726 xfs_dir2_data_free_t *dfp; /* bestfree pointer */
727 char *endptr; /* end of data area */
728 int needscan; /* need to regen bestfree */
729 xfs_dir2_data_unused_t *newdup; /* new unused entry */
730 xfs_dir2_data_unused_t *postdup; /* unused entry after us */
731 xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
732 struct xfs_dir2_data_free *bf;
733
734 hdr = bp->b_addr;
735
736 /*
737 * Figure out where the end of the data area is.
738 */
739 if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
740 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
741 endptr = (char *)hdr + args->geo->blksize;
742 else {
743 xfs_dir2_block_tail_t *btp; /* block tail */
744
745 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
746 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
747 btp = xfs_dir2_block_tail_p(args->geo, hdr);
748 endptr = (char *)xfs_dir2_block_leaf_p(btp);
749 }
750 /*
751 * If this isn't the start of the block, then back up to
752 * the previous entry and see if it's free.
753 */
754 if (offset > args->dp->d_ops->data_entry_offset) {
755 __be16 *tagp; /* tag just before us */
756
757 tagp = (__be16 *)((char *)hdr + offset) - 1;
758 prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
759 if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
760 prevdup = NULL;
761 } else
762 prevdup = NULL;
763 /*
764 * If this isn't the end of the block, see if the entry after
765 * us is free.
766 */
767 if ((char *)hdr + offset + len < endptr) {
768 postdup =
769 (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
770 if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
771 postdup = NULL;
772 } else
773 postdup = NULL;
774 ASSERT(*needscanp == 0);
775 needscan = 0;
776 /*
777 * Previous and following entries are both free,
778 * merge everything into a single free entry.
779 */
780 bf = args->dp->d_ops->data_bestfree_p(hdr);
781 if (prevdup && postdup) {
782 xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
783
784 /*
785 * See if prevdup and/or postdup are in bestfree table.
786 */
787 dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
788 dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
789 /*
790 * We need a rescan unless there are exactly 2 free entries
791 * namely our two. Then we know what's happening, otherwise
792 * since the third bestfree is there, there might be more
793 * entries.
794 */
795 needscan = (bf[2].length != 0);
796 /*
797 * Fix up the new big freespace.
798 */
799 be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
800 *xfs_dir2_data_unused_tag_p(prevdup) =
801 cpu_to_be16((char *)prevdup - (char *)hdr);
802 xfs_dir2_data_log_unused(args, bp, prevdup);
803 if (!needscan) {
804 /*
805 * Has to be the case that entries 0 and 1 are
806 * dfp and dfp2 (don't know which is which), and
807 * entry 2 is empty.
808 * Remove entry 1 first then entry 0.
809 */
810 ASSERT(dfp && dfp2);
811 if (dfp == &bf[1]) {
812 dfp = &bf[0];
813 ASSERT(dfp2 == dfp);
814 dfp2 = &bf[1];
815 }
816 xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
817 xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
818 /*
819 * Now insert the new entry.
820 */
821 dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
822 needlogp);
823 ASSERT(dfp == &bf[0]);
824 ASSERT(dfp->length == prevdup->length);
825 ASSERT(!dfp[1].length);
826 ASSERT(!dfp[2].length);
827 }
828 }
829 /*
830 * The entry before us is free, merge with it.
831 */
832 else if (prevdup) {
833 dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
834 be16_add_cpu(&prevdup->length, len);
835 *xfs_dir2_data_unused_tag_p(prevdup) =
836 cpu_to_be16((char *)prevdup - (char *)hdr);
837 xfs_dir2_data_log_unused(args, bp, prevdup);
838 /*
839 * If the previous entry was in the table, the new entry
840 * is longer, so it will be in the table too. Remove
841 * the old one and add the new one.
842 */
843 if (dfp) {
844 xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
845 xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
846 }
847 /*
848 * Otherwise we need a scan if the new entry is big enough.
849 */
850 else {
851 needscan = be16_to_cpu(prevdup->length) >
852 be16_to_cpu(bf[2].length);
853 }
854 }
855 /*
856 * The following entry is free, merge with it.
857 */
858 else if (postdup) {
859 dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
860 newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
861 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
862 newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
863 *xfs_dir2_data_unused_tag_p(newdup) =
864 cpu_to_be16((char *)newdup - (char *)hdr);
865 xfs_dir2_data_log_unused(args, bp, newdup);
866 /*
867 * If the following entry was in the table, the new entry
868 * is longer, so it will be in the table too. Remove
869 * the old one and add the new one.
870 */
871 if (dfp) {
872 xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
873 xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
874 }
875 /*
876 * Otherwise we need a scan if the new entry is big enough.
877 */
878 else {
879 needscan = be16_to_cpu(newdup->length) >
880 be16_to_cpu(bf[2].length);
881 }
882 }
883 /*
884 * Neither neighbor is free. Make a new entry.
885 */
886 else {
887 newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
888 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
889 newdup->length = cpu_to_be16(len);
890 *xfs_dir2_data_unused_tag_p(newdup) =
891 cpu_to_be16((char *)newdup - (char *)hdr);
892 xfs_dir2_data_log_unused(args, bp, newdup);
893 xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
894 }
895 *needscanp = needscan;
896}
897
898/*
899 * Take a byte range out of an existing unused space and make it un-free.
900 */
901void
902xfs_dir2_data_use_free(
903 struct xfs_da_args *args,
904 struct xfs_buf *bp,
905 xfs_dir2_data_unused_t *dup, /* unused entry */
906 xfs_dir2_data_aoff_t offset, /* starting offset to use */
907 xfs_dir2_data_aoff_t len, /* length to use */
908 int *needlogp, /* out: need to log header */
909 int *needscanp) /* out: need regen bestfree */
910{
911 xfs_dir2_data_hdr_t *hdr; /* data block header */
912 xfs_dir2_data_free_t *dfp; /* bestfree pointer */
913 int matchback; /* matches end of freespace */
914 int matchfront; /* matches start of freespace */
915 int needscan; /* need to regen bestfree */
916 xfs_dir2_data_unused_t *newdup; /* new unused entry */
917 xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
918 int oldlen; /* old unused entry's length */
919 struct xfs_dir2_data_free *bf;
920
921 hdr = bp->b_addr;
922 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
923 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
924 hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
925 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
926 ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
927 ASSERT(offset >= (char *)dup - (char *)hdr);
928 ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
929 ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
930 /*
931 * Look up the entry in the bestfree table.
932 */
933 oldlen = be16_to_cpu(dup->length);
934 bf = args->dp->d_ops->data_bestfree_p(hdr);
935 dfp = xfs_dir2_data_freefind(hdr, bf, dup);
936 ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
937 /*
938 * Check for alignment with front and back of the entry.
939 */
940 matchfront = (char *)dup - (char *)hdr == offset;
941 matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
942 ASSERT(*needscanp == 0);
943 needscan = 0;
944 /*
945 * If we matched it exactly we just need to get rid of it from
946 * the bestfree table.
947 */
948 if (matchfront && matchback) {
949 if (dfp) {
950 needscan = (bf[2].offset != 0);
951 if (!needscan)
952 xfs_dir2_data_freeremove(hdr, bf, dfp,
953 needlogp);
954 }
955 }
956 /*
957 * We match the first part of the entry.
958 * Make a new entry with the remaining freespace.
959 */
960 else if (matchfront) {
961 newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
962 newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
963 newdup->length = cpu_to_be16(oldlen - len);
964 *xfs_dir2_data_unused_tag_p(newdup) =
965 cpu_to_be16((char *)newdup - (char *)hdr);
966 xfs_dir2_data_log_unused(args, bp, newdup);
967 /*
968 * If it was in the table, remove it and add the new one.
969 */
970 if (dfp) {
971 xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
972 dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
973 needlogp);
974 ASSERT(dfp != NULL);
975 ASSERT(dfp->length == newdup->length);
976 ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
977 /*
978 * If we got inserted at the last slot,
979 * that means we don't know if there was a better
980 * choice for the last slot, or not. Rescan.
981 */
982 needscan = dfp == &bf[2];
983 }
984 }
985 /*
986 * We match the last part of the entry.
987 * Trim the allocated space off the tail of the entry.
988 */
989 else if (matchback) {
990 newdup = dup;
991 newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
992 *xfs_dir2_data_unused_tag_p(newdup) =
993 cpu_to_be16((char *)newdup - (char *)hdr);
994 xfs_dir2_data_log_unused(args, bp, newdup);
995 /*
996 * If it was in the table, remove it and add the new one.
997 */
998 if (dfp) {
999 xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
1000 dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
1001 needlogp);
1002 ASSERT(dfp != NULL);
1003 ASSERT(dfp->length == newdup->length);
1004 ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
1005 /*
1006 * If we got inserted at the last slot,
1007 * that means we don't know if there was a better
1008 * choice for the last slot, or not. Rescan.
1009 */
1010 needscan = dfp == &bf[2];
1011 }
1012 }
1013 /*
1014 * Poking out the middle of an entry.
1015 * Make two new entries.
1016 */
1017 else {
1018 newdup = dup;
1019 newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
1020 *xfs_dir2_data_unused_tag_p(newdup) =
1021 cpu_to_be16((char *)newdup - (char *)hdr);
1022 xfs_dir2_data_log_unused(args, bp, newdup);
1023 newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
1024 newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
1025 newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
1026 *xfs_dir2_data_unused_tag_p(newdup2) =
1027 cpu_to_be16((char *)newdup2 - (char *)hdr);
1028 xfs_dir2_data_log_unused(args, bp, newdup2);
1029 /*
1030 * If the old entry was in the table, we need to scan
1031 * if the 3rd entry was valid, since these entries
1032 * are smaller than the old one.
1033 * If we don't need to scan that means there were 1 or 2
1034 * entries in the table, and removing the old and adding
1035 * the 2 new will work.
1036 */
1037 if (dfp) {
1038 needscan = (bf[2].length != 0);
1039 if (!needscan) {
1040 xfs_dir2_data_freeremove(hdr, bf, dfp,
1041 needlogp);
1042 xfs_dir2_data_freeinsert(hdr, bf, newdup,
1043 needlogp);
1044 xfs_dir2_data_freeinsert(hdr, bf, newdup2,
1045 needlogp);
1046 }
1047 }
1048 }
1049 *needscanp = needscan;
1050}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000000..a19174eb3cb2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -0,0 +1,1831 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
29#include "xfs_inode.h"
30#include "xfs_bmap.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h"
33#include "xfs_error.h"
34#include "xfs_trace.h"
35#include "xfs_trans.h"
36#include "xfs_buf_item.h"
37#include "xfs_cksum.h"
38
39/*
40 * Local function declarations.
41 */
42static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
43 int *indexp, struct xfs_buf **dbpp);
44static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
45 struct xfs_buf *bp, int first, int last);
46static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
47 struct xfs_buf *bp);
48
/*
 * Check the internal consistency of a leaf1 block.
 * Pop an assert if something is wrong.
 *
 * xfs_dir3_leaf_check() expands to a single statement in both the DEBUG
 * and the non-DEBUG build, so it is written with a trailing semicolon at
 * the call site and is safe as the body of an unbraced if/else.
 */
#ifdef DEBUG
#define	xfs_dir3_leaf_check(dp, bp) \
do { \
	if (!xfs_dir3_leaf1_check((dp), (bp))) \
		ASSERT(0); \
} while (0)

/*
 * Return true when the buffer holds a plausible leaf1 block: the magic
 * number must be a leaf1 magic, a v3 block must record the buffer's own
 * block number, and the common leaf checks must pass.
 */
STATIC bool
xfs_dir3_leaf1_check(
	struct xfs_inode	*dp,
	struct xfs_buf		*bp)
{
	struct xfs_dir2_leaf	*leaf = bp->b_addr;
	struct xfs_dir3_icleaf_hdr leafhdr;

	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);

	if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;

		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
			return false;
	} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
		return false;

	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
}
#else
#define	xfs_dir3_leaf_check(dp, bp)	do { } while (0)
#endif
82
83bool
84xfs_dir3_leaf_check_int(
85 struct xfs_mount *mp,
86 struct xfs_inode *dp,
87 struct xfs_dir3_icleaf_hdr *hdr,
88 struct xfs_dir2_leaf *leaf)
89{
90 struct xfs_dir2_leaf_entry *ents;
91 xfs_dir2_leaf_tail_t *ltp;
92 int stale;
93 int i;
94 const struct xfs_dir_ops *ops;
95 struct xfs_dir3_icleaf_hdr leafhdr;
96 struct xfs_da_geometry *geo = mp->m_dir_geo;
97
98 /*
99 * we can be passed a null dp here from a verifier, so we need to go the
100 * hard way to get them.
101 */
102 ops = xfs_dir_get_ops(mp, dp);
103
104 if (!hdr) {
105 ops->leaf_hdr_from_disk(&leafhdr, leaf);
106 hdr = &leafhdr;
107 }
108
109 ents = ops->leaf_ents_p(leaf);
110 ltp = xfs_dir2_leaf_tail_p(geo, leaf);
111
112 /*
113 * XXX (dgc): This value is not restrictive enough.
114 * Should factor in the size of the bests table as well.
115 * We can deduce a value for that from di_size.
116 */
117 if (hdr->count > ops->leaf_max_ents(geo))
118 return false;
119
120 /* Leaves and bests don't overlap in leaf format. */
121 if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
122 hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
123 (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
124 return false;
125
126 /* Check hash value order, count stale entries. */
127 for (i = stale = 0; i < hdr->count; i++) {
128 if (i + 1 < hdr->count) {
129 if (be32_to_cpu(ents[i].hashval) >
130 be32_to_cpu(ents[i + 1].hashval))
131 return false;
132 }
133 if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
134 stale++;
135 }
136 if (hdr->stale != stale)
137 return false;
138 return true;
139}
140
141/*
142 * We verify the magic numbers before decoding the leaf header so that on debug
143 * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
144 * to incorrect magic numbers.
145 */
146static bool
147xfs_dir3_leaf_verify(
148 struct xfs_buf *bp,
149 __uint16_t magic)
150{
151 struct xfs_mount *mp = bp->b_target->bt_mount;
152 struct xfs_dir2_leaf *leaf = bp->b_addr;
153
154 ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
155
156 if (xfs_sb_version_hascrc(&mp->m_sb)) {
157 struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
158 __uint16_t magic3;
159
160 magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
161 : XFS_DIR3_LEAFN_MAGIC;
162
163 if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
164 return false;
165 if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
166 return false;
167 if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
168 return false;
169 } else {
170 if (leaf->hdr.info.magic != cpu_to_be16(magic))
171 return false;
172 }
173
174 return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
175}
176
177static void
178__read_verify(
179 struct xfs_buf *bp,
180 __uint16_t magic)
181{
182 struct xfs_mount *mp = bp->b_target->bt_mount;
183
184 if (xfs_sb_version_hascrc(&mp->m_sb) &&
185 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
186 xfs_buf_ioerror(bp, -EFSBADCRC);
187 else if (!xfs_dir3_leaf_verify(bp, magic))
188 xfs_buf_ioerror(bp, -EFSCORRUPTED);
189
190 if (bp->b_error)
191 xfs_verifier_error(bp);
192}
193
194static void
195__write_verify(
196 struct xfs_buf *bp,
197 __uint16_t magic)
198{
199 struct xfs_mount *mp = bp->b_target->bt_mount;
200 struct xfs_buf_log_item *bip = bp->b_fspriv;
201 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
202
203 if (!xfs_dir3_leaf_verify(bp, magic)) {
204 xfs_buf_ioerror(bp, -EFSCORRUPTED);
205 xfs_verifier_error(bp);
206 return;
207 }
208
209 if (!xfs_sb_version_hascrc(&mp->m_sb))
210 return;
211
212 if (bip)
213 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
214
215 xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
216}
217
218static void
219xfs_dir3_leaf1_read_verify(
220 struct xfs_buf *bp)
221{
222 __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
223}
224
225static void
226xfs_dir3_leaf1_write_verify(
227 struct xfs_buf *bp)
228{
229 __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
230}
231
232static void
233xfs_dir3_leafn_read_verify(
234 struct xfs_buf *bp)
235{
236 __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
237}
238
239static void
240xfs_dir3_leafn_write_verify(
241 struct xfs_buf *bp)
242{
243 __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
244}
245
246const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
247 .verify_read = xfs_dir3_leaf1_read_verify,
248 .verify_write = xfs_dir3_leaf1_write_verify,
249};
250
251const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
252 .verify_read = xfs_dir3_leafn_read_verify,
253 .verify_write = xfs_dir3_leafn_write_verify,
254};
255
256static int
257xfs_dir3_leaf_read(
258 struct xfs_trans *tp,
259 struct xfs_inode *dp,
260 xfs_dablk_t fbno,
261 xfs_daddr_t mappedbno,
262 struct xfs_buf **bpp)
263{
264 int err;
265
266 err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
267 XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
268 if (!err && tp)
269 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
270 return err;
271}
272
273int
274xfs_dir3_leafn_read(
275 struct xfs_trans *tp,
276 struct xfs_inode *dp,
277 xfs_dablk_t fbno,
278 xfs_daddr_t mappedbno,
279 struct xfs_buf **bpp)
280{
281 int err;
282
283 err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
284 XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
285 if (!err && tp)
286 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
287 return err;
288}
289
290/*
291 * Initialize a new leaf block, leaf1 or leafn magic accepted.
292 */
293static void
294xfs_dir3_leaf_init(
295 struct xfs_mount *mp,
296 struct xfs_trans *tp,
297 struct xfs_buf *bp,
298 xfs_ino_t owner,
299 __uint16_t type)
300{
301 struct xfs_dir2_leaf *leaf = bp->b_addr;
302
303 ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
304
305 if (xfs_sb_version_hascrc(&mp->m_sb)) {
306 struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
307
308 memset(leaf3, 0, sizeof(*leaf3));
309
310 leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
311 ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
312 : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
313 leaf3->info.blkno = cpu_to_be64(bp->b_bn);
314 leaf3->info.owner = cpu_to_be64(owner);
315 uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
316 } else {
317 memset(leaf, 0, sizeof(*leaf));
318 leaf->hdr.info.magic = cpu_to_be16(type);
319 }
320
321 /*
322 * If it's a leaf-format directory initialize the tail.
323 * Caller is responsible for initialising the bests table.
324 */
325 if (type == XFS_DIR2_LEAF1_MAGIC) {
326 struct xfs_dir2_leaf_tail *ltp;
327
328 ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
329 ltp->bestcount = 0;
330 bp->b_ops = &xfs_dir3_leaf1_buf_ops;
331 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
332 } else {
333 bp->b_ops = &xfs_dir3_leafn_buf_ops;
334 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
335 }
336}
337
338int
339xfs_dir3_leaf_get_buf(
340 xfs_da_args_t *args,
341 xfs_dir2_db_t bno,
342 struct xfs_buf **bpp,
343 __uint16_t magic)
344{
345 struct xfs_inode *dp = args->dp;
346 struct xfs_trans *tp = args->trans;
347 struct xfs_mount *mp = dp->i_mount;
348 struct xfs_buf *bp;
349 int error;
350
351 ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
352 ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
353 bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
354
355 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
356 -1, &bp, XFS_DATA_FORK);
357 if (error)
358 return error;
359
360 xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
361 xfs_dir3_leaf_log_header(args, bp);
362 if (magic == XFS_DIR2_LEAF1_MAGIC)
363 xfs_dir3_leaf_log_tail(args, bp);
364 *bpp = bp;
365 return 0;
366}
367
368/*
369 * Convert a block form directory to a leaf form directory.
370 */
371int /* error */
372xfs_dir2_block_to_leaf(
373 xfs_da_args_t *args, /* operation arguments */
374 struct xfs_buf *dbp) /* input block's buffer */
375{
376 __be16 *bestsp; /* leaf's bestsp entries */
377 xfs_dablk_t blkno; /* leaf block's bno */
378 xfs_dir2_data_hdr_t *hdr; /* block header */
379 xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */
380 xfs_dir2_block_tail_t *btp; /* block's tail */
381 xfs_inode_t *dp; /* incore directory inode */
382 int error; /* error return code */
383 struct xfs_buf *lbp; /* leaf block's buffer */
384 xfs_dir2_db_t ldb; /* leaf block's bno */
385 xfs_dir2_leaf_t *leaf; /* leaf structure */
386 xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
387 xfs_mount_t *mp; /* filesystem mount point */
388 int needlog; /* need to log block header */
389 int needscan; /* need to rescan bestfree */
390 xfs_trans_t *tp; /* transaction pointer */
391 struct xfs_dir2_data_free *bf;
392 struct xfs_dir2_leaf_entry *ents;
393 struct xfs_dir3_icleaf_hdr leafhdr;
394
395 trace_xfs_dir2_block_to_leaf(args);
396
397 dp = args->dp;
398 mp = dp->i_mount;
399 tp = args->trans;
400 /*
401 * Add the leaf block to the inode.
402 * This interface will only put blocks in the leaf/node range.
403 * Since that's empty now, we'll get the root (block 0 in range).
404 */
405 if ((error = xfs_da_grow_inode(args, &blkno))) {
406 return error;
407 }
408 ldb = xfs_dir2_da_to_db(args->geo, blkno);
409 ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
410 /*
411 * Initialize the leaf block, get a buffer for it.
412 */
413 error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
414 if (error)
415 return error;
416
417 leaf = lbp->b_addr;
418 hdr = dbp->b_addr;
419 xfs_dir3_data_check(dp, dbp);
420 btp = xfs_dir2_block_tail_p(args->geo, hdr);
421 blp = xfs_dir2_block_leaf_p(btp);
422 bf = dp->d_ops->data_bestfree_p(hdr);
423 ents = dp->d_ops->leaf_ents_p(leaf);
424
425 /*
426 * Set the counts in the leaf header.
427 */
428 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
429 leafhdr.count = be32_to_cpu(btp->count);
430 leafhdr.stale = be32_to_cpu(btp->stale);
431 dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
432 xfs_dir3_leaf_log_header(args, lbp);
433
434 /*
435 * Could compact these but I think we always do the conversion
436 * after squeezing out stale entries.
437 */
438 memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
439 xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
440 needscan = 0;
441 needlog = 1;
442 /*
443 * Make the space formerly occupied by the leaf entries and block
444 * tail be free.
445 */
446 xfs_dir2_data_make_free(args, dbp,
447 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
448 (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
449 (char *)blp),
450 &needlog, &needscan);
451 /*
452 * Fix up the block header, make it a data block.
453 */
454 dbp->b_ops = &xfs_dir3_data_buf_ops;
455 xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
456 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
457 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
458 else
459 hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
460
461 if (needscan)
462 xfs_dir2_data_freescan(dp, hdr, &needlog);
463 /*
464 * Set up leaf tail and bests table.
465 */
466 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
467 ltp->bestcount = cpu_to_be32(1);
468 bestsp = xfs_dir2_leaf_bests_p(ltp);
469 bestsp[0] = bf[0].length;
470 /*
471 * Log the data header and leaf bests table.
472 */
473 if (needlog)
474 xfs_dir2_data_log_header(args, dbp);
475 xfs_dir3_leaf_check(dp, lbp);
476 xfs_dir3_data_check(dp, dbp);
477 xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
478 return 0;
479}
480
481STATIC void
482xfs_dir3_leaf_find_stale(
483 struct xfs_dir3_icleaf_hdr *leafhdr,
484 struct xfs_dir2_leaf_entry *ents,
485 int index,
486 int *lowstale,
487 int *highstale)
488{
489 /*
490 * Find the first stale entry before our index, if any.
491 */
492 for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
493 if (ents[*lowstale].address ==
494 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
495 break;
496 }
497
498 /*
499 * Find the first stale entry at or after our index, if any.
500 * Stop if the result would require moving more entries than using
501 * lowstale.
502 */
503 for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
504 if (ents[*highstale].address ==
505 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
506 break;
507 if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
508 break;
509 }
510}
511
512struct xfs_dir2_leaf_entry *
513xfs_dir3_leaf_find_entry(
514 struct xfs_dir3_icleaf_hdr *leafhdr,
515 struct xfs_dir2_leaf_entry *ents,
516 int index, /* leaf table position */
517 int compact, /* need to compact leaves */
518 int lowstale, /* index of prev stale leaf */
519 int highstale, /* index of next stale leaf */
520 int *lfloglow, /* low leaf logging index */
521 int *lfloghigh) /* high leaf logging index */
522{
523 if (!leafhdr->stale) {
524 xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
525
526 /*
527 * Now we need to make room to insert the leaf entry.
528 *
529 * If there are no stale entries, just insert a hole at index.
530 */
531 lep = &ents[index];
532 if (index < leafhdr->count)
533 memmove(lep + 1, lep,
534 (leafhdr->count - index) * sizeof(*lep));
535
536 /*
537 * Record low and high logging indices for the leaf.
538 */
539 *lfloglow = index;
540 *lfloghigh = leafhdr->count++;
541 return lep;
542 }
543
544 /*
545 * There are stale entries.
546 *
547 * We will use one of them for the new entry. It's probably not at
548 * the right location, so we'll have to shift some up or down first.
549 *
550 * If we didn't compact before, we need to find the nearest stale
551 * entries before and after our insertion point.
552 */
553 if (compact == 0)
554 xfs_dir3_leaf_find_stale(leafhdr, ents, index,
555 &lowstale, &highstale);
556
557 /*
558 * If the low one is better, use it.
559 */
560 if (lowstale >= 0 &&
561 (highstale == leafhdr->count ||
562 index - lowstale - 1 < highstale - index)) {
563 ASSERT(index - lowstale - 1 >= 0);
564 ASSERT(ents[lowstale].address ==
565 cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
566
567 /*
568 * Copy entries up to cover the stale entry and make room
569 * for the new entry.
570 */
571 if (index - lowstale - 1 > 0) {
572 memmove(&ents[lowstale], &ents[lowstale + 1],
573 (index - lowstale - 1) *
574 sizeof(xfs_dir2_leaf_entry_t));
575 }
576 *lfloglow = MIN(lowstale, *lfloglow);
577 *lfloghigh = MAX(index - 1, *lfloghigh);
578 leafhdr->stale--;
579 return &ents[index - 1];
580 }
581
582 /*
583 * The high one is better, so use that one.
584 */
585 ASSERT(highstale - index >= 0);
586 ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
587
588 /*
589 * Copy entries down to cover the stale entry and make room for the
590 * new entry.
591 */
592 if (highstale - index > 0) {
593 memmove(&ents[index + 1], &ents[index],
594 (highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
595 }
596 *lfloglow = MIN(index, *lfloglow);
597 *lfloghigh = MAX(highstale, *lfloghigh);
598 leafhdr->stale--;
599 return &ents[index];
600}
601
602/*
603 * Add an entry to a leaf form directory.
604 */
605int /* error */
606xfs_dir2_leaf_addname(
607 xfs_da_args_t *args) /* operation arguments */
608{
609 __be16 *bestsp; /* freespace table in leaf */
610 int compact; /* need to compact leaves */
611 xfs_dir2_data_hdr_t *hdr; /* data block header */
612 struct xfs_buf *dbp; /* data block buffer */
613 xfs_dir2_data_entry_t *dep; /* data block entry */
614 xfs_inode_t *dp; /* incore directory inode */
615 xfs_dir2_data_unused_t *dup; /* data unused entry */
616 int error; /* error return value */
617 int grown; /* allocated new data block */
618 int highstale; /* index of next stale leaf */
619 int i; /* temporary, index */
620 int index; /* leaf table position */
621 struct xfs_buf *lbp; /* leaf's buffer */
622 xfs_dir2_leaf_t *leaf; /* leaf structure */
623 int length; /* length of new entry */
624 xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
625 int lfloglow; /* low leaf logging index */
626 int lfloghigh; /* high leaf logging index */
627 int lowstale; /* index of prev stale leaf */
628 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
629 xfs_mount_t *mp; /* filesystem mount point */
630 int needbytes; /* leaf block bytes needed */
631 int needlog; /* need to log data header */
632 int needscan; /* need to rescan data free */
633 __be16 *tagp; /* end of data entry */
634 xfs_trans_t *tp; /* transaction pointer */
635 xfs_dir2_db_t use_block; /* data block number */
636 struct xfs_dir2_data_free *bf; /* bestfree table */
637 struct xfs_dir2_leaf_entry *ents;
638 struct xfs_dir3_icleaf_hdr leafhdr;
639
640 trace_xfs_dir2_leaf_addname(args);
641
642 dp = args->dp;
643 tp = args->trans;
644 mp = dp->i_mount;
645
646 error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
647 if (error)
648 return error;
649
650 /*
651 * Look up the entry by hash value and name.
652 * We know it's not there, our caller has already done a lookup.
653 * So the index is of the entry to insert in front of.
654 * But if there are dup hash values the index is of the first of those.
655 */
656 index = xfs_dir2_leaf_search_hash(args, lbp);
657 leaf = lbp->b_addr;
658 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
659 ents = dp->d_ops->leaf_ents_p(leaf);
660 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
661 bestsp = xfs_dir2_leaf_bests_p(ltp);
662 length = dp->d_ops->data_entsize(args->namelen);
663
664 /*
665 * See if there are any entries with the same hash value
666 * and space in their block for the new entry.
667 * This is good because it puts multiple same-hash value entries
668 * in a data block, improving the lookup of those entries.
669 */
670 for (use_block = -1, lep = &ents[index];
671 index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
672 index++, lep++) {
673 if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
674 continue;
675 i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
676 ASSERT(i < be32_to_cpu(ltp->bestcount));
677 ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
678 if (be16_to_cpu(bestsp[i]) >= length) {
679 use_block = i;
680 break;
681 }
682 }
683 /*
684 * Didn't find a block yet, linear search all the data blocks.
685 */
686 if (use_block == -1) {
687 for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
688 /*
689 * Remember a block we see that's missing.
690 */
691 if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
692 use_block == -1)
693 use_block = i;
694 else if (be16_to_cpu(bestsp[i]) >= length) {
695 use_block = i;
696 break;
697 }
698 }
699 }
700 /*
701 * How many bytes do we need in the leaf block?
702 */
703 needbytes = 0;
704 if (!leafhdr.stale)
705 needbytes += sizeof(xfs_dir2_leaf_entry_t);
706 if (use_block == -1)
707 needbytes += sizeof(xfs_dir2_data_off_t);
708
709 /*
710 * Now kill use_block if it refers to a missing block, so we
711 * can use it as an indication of allocation needed.
712 */
713 if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
714 use_block = -1;
715 /*
716 * If we don't have enough free bytes but we can make enough
717 * by compacting out stale entries, we'll do that.
718 */
719 if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
720 leafhdr.stale > 1)
721 compact = 1;
722
723 /*
724 * Otherwise if we don't have enough free bytes we need to
725 * convert to node form.
726 */
727 else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
728 /*
729 * Just checking or no space reservation, give up.
730 */
731 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
732 args->total == 0) {
733 xfs_trans_brelse(tp, lbp);
734 return -ENOSPC;
735 }
736 /*
737 * Convert to node form.
738 */
739 error = xfs_dir2_leaf_to_node(args, lbp);
740 if (error)
741 return error;
742 /*
743 * Then add the new entry.
744 */
745 return xfs_dir2_node_addname(args);
746 }
747 /*
748 * Otherwise it will fit without compaction.
749 */
750 else
751 compact = 0;
752 /*
753 * If just checking, then it will fit unless we needed to allocate
754 * a new data block.
755 */
756 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
757 xfs_trans_brelse(tp, lbp);
758 return use_block == -1 ? -ENOSPC : 0;
759 }
760 /*
761 * If no allocations are allowed, return now before we've
762 * changed anything.
763 */
764 if (args->total == 0 && use_block == -1) {
765 xfs_trans_brelse(tp, lbp);
766 return -ENOSPC;
767 }
768 /*
769 * Need to compact the leaf entries, removing stale ones.
770 * Leave one stale entry behind - the one closest to our
771 * insertion index - and we'll shift that one to our insertion
772 * point later.
773 */
774 if (compact) {
775 xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
776 &highstale, &lfloglow, &lfloghigh);
777 }
778 /*
779 * There are stale entries, so we'll need log-low and log-high
780 * impossibly bad values later.
781 */
782 else if (leafhdr.stale) {
783 lfloglow = leafhdr.count;
784 lfloghigh = -1;
785 }
786 /*
787 * If there was no data block space found, we need to allocate
788 * a new one.
789 */
790 if (use_block == -1) {
791 /*
792 * Add the new data block.
793 */
794 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
795 &use_block))) {
796 xfs_trans_brelse(tp, lbp);
797 return error;
798 }
799 /*
800 * Initialize the block.
801 */
802 if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
803 xfs_trans_brelse(tp, lbp);
804 return error;
805 }
806 /*
807 * If we're adding a new data block on the end we need to
808 * extend the bests table. Copy it up one entry.
809 */
810 if (use_block >= be32_to_cpu(ltp->bestcount)) {
811 bestsp--;
812 memmove(&bestsp[0], &bestsp[1],
813 be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
814 be32_add_cpu(&ltp->bestcount, 1);
815 xfs_dir3_leaf_log_tail(args, lbp);
816 xfs_dir3_leaf_log_bests(args, lbp, 0,
817 be32_to_cpu(ltp->bestcount) - 1);
818 }
819 /*
820 * If we're filling in a previously empty block just log it.
821 */
822 else
823 xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
824 hdr = dbp->b_addr;
825 bf = dp->d_ops->data_bestfree_p(hdr);
826 bestsp[use_block] = bf[0].length;
827 grown = 1;
828 } else {
829 /*
830 * Already had space in some data block.
831 * Just read that one in.
832 */
833 error = xfs_dir3_data_read(tp, dp,
834 xfs_dir2_db_to_da(args->geo, use_block),
835 -1, &dbp);
836 if (error) {
837 xfs_trans_brelse(tp, lbp);
838 return error;
839 }
840 hdr = dbp->b_addr;
841 bf = dp->d_ops->data_bestfree_p(hdr);
842 grown = 0;
843 }
844 /*
845 * Point to the biggest freespace in our data block.
846 */
847 dup = (xfs_dir2_data_unused_t *)
848 ((char *)hdr + be16_to_cpu(bf[0].offset));
849 ASSERT(be16_to_cpu(dup->length) >= length);
850 needscan = needlog = 0;
851 /*
852 * Mark the initial part of our freespace in use for the new entry.
853 */
854 xfs_dir2_data_use_free(args, dbp, dup,
855 (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
856 &needlog, &needscan);
857 /*
858 * Initialize our new entry (at last).
859 */
860 dep = (xfs_dir2_data_entry_t *)dup;
861 dep->inumber = cpu_to_be64(args->inumber);
862 dep->namelen = args->namelen;
863 memcpy(dep->name, args->name, dep->namelen);
864 dp->d_ops->data_put_ftype(dep, args->filetype);
865 tagp = dp->d_ops->data_entry_tag_p(dep);
866 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
867 /*
868 * Need to scan fix up the bestfree table.
869 */
870 if (needscan)
871 xfs_dir2_data_freescan(dp, hdr, &needlog);
872 /*
873 * Need to log the data block's header.
874 */
875 if (needlog)
876 xfs_dir2_data_log_header(args, dbp);
877 xfs_dir2_data_log_entry(args, dbp, dep);
878 /*
879 * If the bests table needs to be changed, do it.
880 * Log the change unless we've already done that.
881 */
882 if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
883 bestsp[use_block] = bf[0].length;
884 if (!grown)
885 xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
886 }
887
888 lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
889 highstale, &lfloglow, &lfloghigh);
890
891 /*
892 * Fill in the new leaf entry.
893 */
894 lep->hashval = cpu_to_be32(args->hashval);
895 lep->address = cpu_to_be32(
896 xfs_dir2_db_off_to_dataptr(args->geo, use_block,
897 be16_to_cpu(*tagp)));
898 /*
899 * Log the leaf fields and give up the buffers.
900 */
901 dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
902 xfs_dir3_leaf_log_header(args, lbp);
903 xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
904 xfs_dir3_leaf_check(dp, lbp);
905 xfs_dir3_data_check(dp, dbp);
906 return 0;
907}
908
909/*
910 * Compact out any stale entries in the leaf.
911 * Log the header and changed leaf entries, if any.
912 */
913void
914xfs_dir3_leaf_compact(
915 xfs_da_args_t *args, /* operation arguments */
916 struct xfs_dir3_icleaf_hdr *leafhdr,
917 struct xfs_buf *bp) /* leaf buffer */
918{
919 int from; /* source leaf index */
920 xfs_dir2_leaf_t *leaf; /* leaf structure */
921 int loglow; /* first leaf entry to log */
922 int to; /* target leaf index */
923 struct xfs_dir2_leaf_entry *ents;
924 struct xfs_inode *dp = args->dp;
925
926 leaf = bp->b_addr;
927 if (!leafhdr->stale)
928 return;
929
930 /*
931 * Compress out the stale entries in place.
932 */
933 ents = dp->d_ops->leaf_ents_p(leaf);
934 for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
935 if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
936 continue;
937 /*
938 * Only actually copy the entries that are different.
939 */
940 if (from > to) {
941 if (loglow == -1)
942 loglow = to;
943 ents[to] = ents[from];
944 }
945 to++;
946 }
947 /*
948 * Update and log the header, log the leaf entries.
949 */
950 ASSERT(leafhdr->stale == from - to);
951 leafhdr->count -= leafhdr->stale;
952 leafhdr->stale = 0;
953
954 dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
955 xfs_dir3_leaf_log_header(args, bp);
956 if (loglow != -1)
957 xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
958}
959
960/*
961 * Compact the leaf entries, removing stale ones.
962 * Leave one stale entry behind - the one closest to our
963 * insertion index - and the caller will shift that one to our insertion
964 * point later.
965 * Return new insertion index, where the remaining stale entry is,
966 * and leaf logging indices.
967 */
968void
969xfs_dir3_leaf_compact_x1(
970 struct xfs_dir3_icleaf_hdr *leafhdr,
971 struct xfs_dir2_leaf_entry *ents,
972 int *indexp, /* insertion index */
973 int *lowstalep, /* out: stale entry before us */
974 int *highstalep, /* out: stale entry after us */
975 int *lowlogp, /* out: low log index */
976 int *highlogp) /* out: high log index */
977{
978 int from; /* source copy index */
979 int highstale; /* stale entry at/after index */
980 int index; /* insertion index */
981 int keepstale; /* source index of kept stale */
982 int lowstale; /* stale entry before index */
983 int newindex=0; /* new insertion index */
984 int to; /* destination copy index */
985
986 ASSERT(leafhdr->stale > 1);
987 index = *indexp;
988
989 xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
990
991 /*
992 * Pick the better of lowstale and highstale.
993 */
994 if (lowstale >= 0 &&
995 (highstale == leafhdr->count ||
996 index - lowstale <= highstale - index))
997 keepstale = lowstale;
998 else
999 keepstale = highstale;
1000 /*
1001 * Copy the entries in place, removing all the stale entries
1002 * except keepstale.
1003 */
1004 for (from = to = 0; from < leafhdr->count; from++) {
1005 /*
1006 * Notice the new value of index.
1007 */
1008 if (index == from)
1009 newindex = to;
1010 if (from != keepstale &&
1011 ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
1012 if (from == to)
1013 *lowlogp = to;
1014 continue;
1015 }
1016 /*
1017 * Record the new keepstale value for the insertion.
1018 */
1019 if (from == keepstale)
1020 lowstale = highstale = to;
1021 /*
1022 * Copy only the entries that have moved.
1023 */
1024 if (from > to)
1025 ents[to] = ents[from];
1026 to++;
1027 }
1028 ASSERT(from > to);
1029 /*
1030 * If the insertion point was past the last entry,
1031 * set the new insertion point accordingly.
1032 */
1033 if (index == from)
1034 newindex = to;
1035 *indexp = newindex;
1036 /*
1037 * Adjust the leaf header values.
1038 */
1039 leafhdr->count -= from - to;
1040 leafhdr->stale = 1;
1041 /*
1042 * Remember the low/high stale value only in the "right"
1043 * direction.
1044 */
1045 if (lowstale >= newindex)
1046 lowstale = -1;
1047 else
1048 highstale = leafhdr->count;
1049 *highlogp = leafhdr->count - 1;
1050 *lowstalep = lowstale;
1051 *highstalep = highstale;
1052}
1053
1054/*
1055 * Log the bests entries indicated from a leaf1 block.
1056 */
1057static void
1058xfs_dir3_leaf_log_bests(
1059 struct xfs_da_args *args,
1060 struct xfs_buf *bp, /* leaf buffer */
1061 int first, /* first entry to log */
1062 int last) /* last entry to log */
1063{
1064 __be16 *firstb; /* pointer to first entry */
1065 __be16 *lastb; /* pointer to last entry */
1066 struct xfs_dir2_leaf *leaf = bp->b_addr;
1067 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1068
1069 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
1070 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
1071
1072 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
1073 firstb = xfs_dir2_leaf_bests_p(ltp) + first;
1074 lastb = xfs_dir2_leaf_bests_p(ltp) + last;
1075 xfs_trans_log_buf(args->trans, bp,
1076 (uint)((char *)firstb - (char *)leaf),
1077 (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
1078}
1079
1080/*
1081 * Log the leaf entries indicated from a leaf1 or leafn block.
1082 */
1083void
1084xfs_dir3_leaf_log_ents(
1085 struct xfs_da_args *args,
1086 struct xfs_buf *bp,
1087 int first,
1088 int last)
1089{
1090 xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
1091 xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
1092 struct xfs_dir2_leaf *leaf = bp->b_addr;
1093 struct xfs_dir2_leaf_entry *ents;
1094
1095 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
1096 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
1097 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1098 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
1099
1100 ents = args->dp->d_ops->leaf_ents_p(leaf);
1101 firstlep = &ents[first];
1102 lastlep = &ents[last];
1103 xfs_trans_log_buf(args->trans, bp,
1104 (uint)((char *)firstlep - (char *)leaf),
1105 (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
1106}
1107
1108/*
1109 * Log the header of the leaf1 or leafn block.
1110 */
1111void
1112xfs_dir3_leaf_log_header(
1113 struct xfs_da_args *args,
1114 struct xfs_buf *bp)
1115{
1116 struct xfs_dir2_leaf *leaf = bp->b_addr;
1117
1118 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
1119 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
1120 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1121 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
1122
1123 xfs_trans_log_buf(args->trans, bp,
1124 (uint)((char *)&leaf->hdr - (char *)leaf),
1125 args->dp->d_ops->leaf_hdr_size - 1);
1126}
1127
1128/*
1129 * Log the tail of the leaf1 block.
1130 */
1131STATIC void
1132xfs_dir3_leaf_log_tail(
1133 struct xfs_da_args *args,
1134 struct xfs_buf *bp)
1135{
1136 struct xfs_dir2_leaf *leaf = bp->b_addr;
1137 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1138
1139 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
1140 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
1141 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1142 leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
1143
1144 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
1145 xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
1146 (uint)(args->geo->blksize - 1));
1147}
1148
1149/*
1150 * Look up the entry referred to by args in the leaf format directory.
1151 * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
1152 * is also used by the node-format code.
1153 */
1154int
1155xfs_dir2_leaf_lookup(
1156 xfs_da_args_t *args) /* operation arguments */
1157{
1158 struct xfs_buf *dbp; /* data block buffer */
1159 xfs_dir2_data_entry_t *dep; /* data block entry */
1160 xfs_inode_t *dp; /* incore directory inode */
1161 int error; /* error return code */
1162 int index; /* found entry index */
1163 struct xfs_buf *lbp; /* leaf buffer */
1164 xfs_dir2_leaf_t *leaf; /* leaf structure */
1165 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1166 xfs_trans_t *tp; /* transaction pointer */
1167 struct xfs_dir2_leaf_entry *ents;
1168
1169 trace_xfs_dir2_leaf_lookup(args);
1170
1171 /*
1172 * Look up name in the leaf block, returning both buffers and index.
1173 */
1174 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1175 return error;
1176 }
1177 tp = args->trans;
1178 dp = args->dp;
1179 xfs_dir3_leaf_check(dp, lbp);
1180 leaf = lbp->b_addr;
1181 ents = dp->d_ops->leaf_ents_p(leaf);
1182 /*
1183 * Get to the leaf entry and contained data entry address.
1184 */
1185 lep = &ents[index];
1186
1187 /*
1188 * Point to the data entry.
1189 */
1190 dep = (xfs_dir2_data_entry_t *)
1191 ((char *)dbp->b_addr +
1192 xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
1193 /*
1194 * Return the found inode number & CI name if appropriate
1195 */
1196 args->inumber = be64_to_cpu(dep->inumber);
1197 args->filetype = dp->d_ops->data_get_ftype(dep);
1198 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
1199 xfs_trans_brelse(tp, dbp);
1200 xfs_trans_brelse(tp, lbp);
1201 return error;
1202}
1203
1204/*
1205 * Look up name/hash in the leaf block.
1206 * Fill in indexp with the found index, and dbpp with the data buffer.
1207 * If not found dbpp will be NULL, and ENOENT comes back.
1208 * lbpp will always be filled in with the leaf buffer unless there's an error.
1209 */
1210static int /* error */
1211xfs_dir2_leaf_lookup_int(
1212 xfs_da_args_t *args, /* operation arguments */
1213 struct xfs_buf **lbpp, /* out: leaf buffer */
1214 int *indexp, /* out: index in leaf block */
1215 struct xfs_buf **dbpp) /* out: data buffer */
1216{
1217 xfs_dir2_db_t curdb = -1; /* current data block number */
1218 struct xfs_buf *dbp = NULL; /* data buffer */
1219 xfs_dir2_data_entry_t *dep; /* data entry */
1220 xfs_inode_t *dp; /* incore directory inode */
1221 int error; /* error return code */
1222 int index; /* index in leaf block */
1223 struct xfs_buf *lbp; /* leaf buffer */
1224 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1225 xfs_dir2_leaf_t *leaf; /* leaf structure */
1226 xfs_mount_t *mp; /* filesystem mount point */
1227 xfs_dir2_db_t newdb; /* new data block number */
1228 xfs_trans_t *tp; /* transaction pointer */
1229 xfs_dir2_db_t cidb = -1; /* case match data block no. */
1230 enum xfs_dacmp cmp; /* name compare result */
1231 struct xfs_dir2_leaf_entry *ents;
1232 struct xfs_dir3_icleaf_hdr leafhdr;
1233
1234 dp = args->dp;
1235 tp = args->trans;
1236 mp = dp->i_mount;
1237
1238 error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
1239 if (error)
1240 return error;
1241
1242 *lbpp = lbp;
1243 leaf = lbp->b_addr;
1244 xfs_dir3_leaf_check(dp, lbp);
1245 ents = dp->d_ops->leaf_ents_p(leaf);
1246 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1247
1248 /*
1249 * Look for the first leaf entry with our hash value.
1250 */
1251 index = xfs_dir2_leaf_search_hash(args, lbp);
1252 /*
1253 * Loop over all the entries with the right hash value
1254 * looking to match the name.
1255 */
1256 for (lep = &ents[index];
1257 index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
1258 lep++, index++) {
1259 /*
1260 * Skip over stale leaf entries.
1261 */
1262 if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
1263 continue;
1264 /*
1265 * Get the new data block number.
1266 */
1267 newdb = xfs_dir2_dataptr_to_db(args->geo,
1268 be32_to_cpu(lep->address));
1269 /*
1270 * If it's not the same as the old data block number,
1271 * need to pitch the old one and read the new one.
1272 */
1273 if (newdb != curdb) {
1274 if (dbp)
1275 xfs_trans_brelse(tp, dbp);
1276 error = xfs_dir3_data_read(tp, dp,
1277 xfs_dir2_db_to_da(args->geo, newdb),
1278 -1, &dbp);
1279 if (error) {
1280 xfs_trans_brelse(tp, lbp);
1281 return error;
1282 }
1283 curdb = newdb;
1284 }
1285 /*
1286 * Point to the data entry.
1287 */
1288 dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
1289 xfs_dir2_dataptr_to_off(args->geo,
1290 be32_to_cpu(lep->address)));
1291 /*
1292 * Compare name and if it's an exact match, return the index
1293 * and buffer. If it's the first case-insensitive match, store
1294 * the index and buffer and continue looking for an exact match.
1295 */
1296 cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
1297 if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
1298 args->cmpresult = cmp;
1299 *indexp = index;
1300 /* case exact match: return the current buffer. */
1301 if (cmp == XFS_CMP_EXACT) {
1302 *dbpp = dbp;
1303 return 0;
1304 }
1305 cidb = curdb;
1306 }
1307 }
1308 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1309 /*
1310 * Here, we can only be doing a lookup (not a rename or remove).
1311 * If a case-insensitive match was found earlier, re-read the
1312 * appropriate data block if required and return it.
1313 */
1314 if (args->cmpresult == XFS_CMP_CASE) {
1315 ASSERT(cidb != -1);
1316 if (cidb != curdb) {
1317 xfs_trans_brelse(tp, dbp);
1318 error = xfs_dir3_data_read(tp, dp,
1319 xfs_dir2_db_to_da(args->geo, cidb),
1320 -1, &dbp);
1321 if (error) {
1322 xfs_trans_brelse(tp, lbp);
1323 return error;
1324 }
1325 }
1326 *dbpp = dbp;
1327 return 0;
1328 }
1329 /*
1330 * No match found, return -ENOENT.
1331 */
1332 ASSERT(cidb == -1);
1333 if (dbp)
1334 xfs_trans_brelse(tp, dbp);
1335 xfs_trans_brelse(tp, lbp);
1336 return -ENOENT;
1337}
1338
1339/*
1340 * Remove an entry from a leaf format directory.
1341 */
1342int /* error */
1343xfs_dir2_leaf_removename(
1344 xfs_da_args_t *args) /* operation arguments */
1345{
1346 __be16 *bestsp; /* leaf block best freespace */
1347 xfs_dir2_data_hdr_t *hdr; /* data block header */
1348 xfs_dir2_db_t db; /* data block number */
1349 struct xfs_buf *dbp; /* data block buffer */
1350 xfs_dir2_data_entry_t *dep; /* data entry structure */
1351 xfs_inode_t *dp; /* incore directory inode */
1352 int error; /* error return code */
1353 xfs_dir2_db_t i; /* temporary data block # */
1354 int index; /* index into leaf entries */
1355 struct xfs_buf *lbp; /* leaf buffer */
1356 xfs_dir2_leaf_t *leaf; /* leaf structure */
1357 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1358 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1359 xfs_mount_t *mp; /* filesystem mount point */
1360 int needlog; /* need to log data header */
1361 int needscan; /* need to rescan data frees */
1362 xfs_dir2_data_off_t oldbest; /* old value of best free */
1363 xfs_trans_t *tp; /* transaction pointer */
1364 struct xfs_dir2_data_free *bf; /* bestfree table */
1365 struct xfs_dir2_leaf_entry *ents;
1366 struct xfs_dir3_icleaf_hdr leafhdr;
1367
1368 trace_xfs_dir2_leaf_removename(args);
1369
1370 /*
1371 * Lookup the leaf entry, get the leaf and data blocks read in.
1372 */
1373 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1374 return error;
1375 }
1376 dp = args->dp;
1377 tp = args->trans;
1378 mp = dp->i_mount;
1379 leaf = lbp->b_addr;
1380 hdr = dbp->b_addr;
1381 xfs_dir3_data_check(dp, dbp);
1382 bf = dp->d_ops->data_bestfree_p(hdr);
1383 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1384 ents = dp->d_ops->leaf_ents_p(leaf);
1385 /*
1386 * Point to the leaf entry, use that to point to the data entry.
1387 */
1388 lep = &ents[index];
1389 db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
1390 dep = (xfs_dir2_data_entry_t *)((char *)hdr +
1391 xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
1392 needscan = needlog = 0;
1393 oldbest = be16_to_cpu(bf[0].length);
1394 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
1395 bestsp = xfs_dir2_leaf_bests_p(ltp);
1396 ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
1397 /*
1398 * Mark the former data entry unused.
1399 */
1400 xfs_dir2_data_make_free(args, dbp,
1401 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
1402 dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
1403 /*
1404 * We just mark the leaf entry stale by putting a null in it.
1405 */
1406 leafhdr.stale++;
1407 dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
1408 xfs_dir3_leaf_log_header(args, lbp);
1409
1410 lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
1411 xfs_dir3_leaf_log_ents(args, lbp, index, index);
1412
1413 /*
1414 * Scan the freespace in the data block again if necessary,
1415 * log the data block header if necessary.
1416 */
1417 if (needscan)
1418 xfs_dir2_data_freescan(dp, hdr, &needlog);
1419 if (needlog)
1420 xfs_dir2_data_log_header(args, dbp);
1421 /*
1422 * If the longest freespace in the data block has changed,
1423 * put the new value in the bests table and log that.
1424 */
1425 if (be16_to_cpu(bf[0].length) != oldbest) {
1426 bestsp[db] = bf[0].length;
1427 xfs_dir3_leaf_log_bests(args, lbp, db, db);
1428 }
1429 xfs_dir3_data_check(dp, dbp);
1430 /*
1431 * If the data block is now empty then get rid of the data block.
1432 */
1433 if (be16_to_cpu(bf[0].length) ==
1434 args->geo->blksize - dp->d_ops->data_entry_offset) {
1435 ASSERT(db != args->geo->datablk);
1436 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1437 /*
1438 * Nope, can't get rid of it because it caused
1439 * allocation of a bmap btree block to do so.
1440 * Just go on, returning success, leaving the
1441 * empty block in place.
1442 */
1443 if (error == -ENOSPC && args->total == 0)
1444 error = 0;
1445 xfs_dir3_leaf_check(dp, lbp);
1446 return error;
1447 }
1448 dbp = NULL;
1449 /*
1450 * If this is the last data block then compact the
1451 * bests table by getting rid of entries.
1452 */
1453 if (db == be32_to_cpu(ltp->bestcount) - 1) {
1454 /*
1455 * Look for the last active entry (i).
1456 */
1457 for (i = db - 1; i > 0; i--) {
1458 if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
1459 break;
1460 }
1461 /*
1462 * Copy the table down so inactive entries at the
1463 * end are removed.
1464 */
1465 memmove(&bestsp[db - i], bestsp,
1466 (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
1467 be32_add_cpu(&ltp->bestcount, -(db - i));
1468 xfs_dir3_leaf_log_tail(args, lbp);
1469 xfs_dir3_leaf_log_bests(args, lbp, 0,
1470 be32_to_cpu(ltp->bestcount) - 1);
1471 } else
1472 bestsp[db] = cpu_to_be16(NULLDATAOFF);
1473 }
1474 /*
1475 * If the data block was not the first one, drop it.
1476 */
1477 else if (db != args->geo->datablk)
1478 dbp = NULL;
1479
1480 xfs_dir3_leaf_check(dp, lbp);
1481 /*
1482 * See if we can convert to block form.
1483 */
1484 return xfs_dir2_leaf_to_block(args, lbp, dbp);
1485}
1486
1487/*
1488 * Replace the inode number in a leaf format directory entry.
1489 */
1490int /* error */
1491xfs_dir2_leaf_replace(
1492 xfs_da_args_t *args) /* operation arguments */
1493{
1494 struct xfs_buf *dbp; /* data block buffer */
1495 xfs_dir2_data_entry_t *dep; /* data block entry */
1496 xfs_inode_t *dp; /* incore directory inode */
1497 int error; /* error return code */
1498 int index; /* index of leaf entry */
1499 struct xfs_buf *lbp; /* leaf buffer */
1500 xfs_dir2_leaf_t *leaf; /* leaf structure */
1501 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1502 xfs_trans_t *tp; /* transaction pointer */
1503 struct xfs_dir2_leaf_entry *ents;
1504
1505 trace_xfs_dir2_leaf_replace(args);
1506
1507 /*
1508 * Look up the entry.
1509 */
1510 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1511 return error;
1512 }
1513 dp = args->dp;
1514 leaf = lbp->b_addr;
1515 ents = dp->d_ops->leaf_ents_p(leaf);
1516 /*
1517 * Point to the leaf entry, get data address from it.
1518 */
1519 lep = &ents[index];
1520 /*
1521 * Point to the data entry.
1522 */
1523 dep = (xfs_dir2_data_entry_t *)
1524 ((char *)dbp->b_addr +
1525 xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
1526 ASSERT(args->inumber != be64_to_cpu(dep->inumber));
1527 /*
1528 * Put the new inode number in, log it.
1529 */
1530 dep->inumber = cpu_to_be64(args->inumber);
1531 dp->d_ops->data_put_ftype(dep, args->filetype);
1532 tp = args->trans;
1533 xfs_dir2_data_log_entry(args, dbp, dep);
1534 xfs_dir3_leaf_check(dp, lbp);
1535 xfs_trans_brelse(tp, lbp);
1536 return 0;
1537}
1538
1539/*
1540 * Return index in the leaf block (lbp) which is either the first
1541 * one with this hash value, or if there are none, the insert point
1542 * for that hash value.
1543 */
1544int /* index value */
1545xfs_dir2_leaf_search_hash(
1546 xfs_da_args_t *args, /* operation arguments */
1547 struct xfs_buf *lbp) /* leaf buffer */
1548{
1549 xfs_dahash_t hash=0; /* hash from this entry */
1550 xfs_dahash_t hashwant; /* hash value looking for */
1551 int high; /* high leaf index */
1552 int low; /* low leaf index */
1553 xfs_dir2_leaf_t *leaf; /* leaf structure */
1554 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1555 int mid=0; /* current leaf index */
1556 struct xfs_dir2_leaf_entry *ents;
1557 struct xfs_dir3_icleaf_hdr leafhdr;
1558
1559 leaf = lbp->b_addr;
1560 ents = args->dp->d_ops->leaf_ents_p(leaf);
1561 args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1562
1563 /*
1564 * Note, the table cannot be empty, so we have to go through the loop.
1565 * Binary search the leaf entries looking for our hash value.
1566 */
1567 for (lep = ents, low = 0, high = leafhdr.count - 1,
1568 hashwant = args->hashval;
1569 low <= high; ) {
1570 mid = (low + high) >> 1;
1571 if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
1572 break;
1573 if (hash < hashwant)
1574 low = mid + 1;
1575 else
1576 high = mid - 1;
1577 }
1578 /*
1579 * Found one, back up through all the equal hash values.
1580 */
1581 if (hash == hashwant) {
1582 while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
1583 mid--;
1584 }
1585 }
1586 /*
1587 * Need to point to an entry higher than ours.
1588 */
1589 else if (hash < hashwant)
1590 mid++;
1591 return mid;
1592}
1593
1594/*
1595 * Trim off a trailing data block. We know it's empty since the leaf
1596 * freespace table says so.
1597 */
1598int /* error */
1599xfs_dir2_leaf_trim_data(
1600 xfs_da_args_t *args, /* operation arguments */
1601 struct xfs_buf *lbp, /* leaf buffer */
1602 xfs_dir2_db_t db) /* data block number */
1603{
1604 __be16 *bestsp; /* leaf bests table */
1605 struct xfs_buf *dbp; /* data block buffer */
1606 xfs_inode_t *dp; /* incore directory inode */
1607 int error; /* error return value */
1608 xfs_dir2_leaf_t *leaf; /* leaf structure */
1609 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1610 xfs_mount_t *mp; /* filesystem mount point */
1611 xfs_trans_t *tp; /* transaction pointer */
1612
1613 dp = args->dp;
1614 mp = dp->i_mount;
1615 tp = args->trans;
1616 /*
1617 * Read the offending data block. We need its buffer.
1618 */
1619 error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
1620 -1, &dbp);
1621 if (error)
1622 return error;
1623
1624 leaf = lbp->b_addr;
1625 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
1626
1627#ifdef DEBUG
1628{
1629 struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
1630 struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
1631
1632 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
1633 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
1634 ASSERT(be16_to_cpu(bf[0].length) ==
1635 args->geo->blksize - dp->d_ops->data_entry_offset);
1636 ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
1637}
1638#endif
1639
1640 /*
1641 * Get rid of the data block.
1642 */
1643 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1644 ASSERT(error != -ENOSPC);
1645 xfs_trans_brelse(tp, dbp);
1646 return error;
1647 }
1648 /*
1649 * Eliminate the last bests entry from the table.
1650 */
1651 bestsp = xfs_dir2_leaf_bests_p(ltp);
1652 be32_add_cpu(&ltp->bestcount, -1);
1653 memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
1654 xfs_dir3_leaf_log_tail(args, lbp);
1655 xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
1656 return 0;
1657}
1658
1659static inline size_t
1660xfs_dir3_leaf_size(
1661 struct xfs_dir3_icleaf_hdr *hdr,
1662 int counts)
1663{
1664 int entries;
1665 int hdrsize;
1666
1667 entries = hdr->count - hdr->stale;
1668 if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
1669 hdr->magic == XFS_DIR2_LEAFN_MAGIC)
1670 hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
1671 else
1672 hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
1673
1674 return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
1675 + counts * sizeof(xfs_dir2_data_off_t)
1676 + sizeof(xfs_dir2_leaf_tail_t);
1677}
1678
1679/*
1680 * Convert node form directory to leaf form directory.
1681 * The root of the node form dir needs to already be a LEAFN block.
1682 * Just return if we can't do anything.
1683 */
1684int /* error */
1685xfs_dir2_node_to_leaf(
1686 xfs_da_state_t *state) /* directory operation state */
1687{
1688 xfs_da_args_t *args; /* operation arguments */
1689 xfs_inode_t *dp; /* incore directory inode */
1690 int error; /* error return code */
1691 struct xfs_buf *fbp; /* buffer for freespace block */
1692 xfs_fileoff_t fo; /* freespace file offset */
1693 xfs_dir2_free_t *free; /* freespace structure */
1694 struct xfs_buf *lbp; /* buffer for leaf block */
1695 xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
1696 xfs_dir2_leaf_t *leaf; /* leaf structure */
1697 xfs_mount_t *mp; /* filesystem mount point */
1698 int rval; /* successful free trim? */
1699 xfs_trans_t *tp; /* transaction pointer */
1700 struct xfs_dir3_icleaf_hdr leafhdr;
1701 struct xfs_dir3_icfree_hdr freehdr;
1702
1703 /*
1704 * There's more than a leaf level in the btree, so there must
1705 * be multiple leafn blocks. Give up.
1706 */
1707 if (state->path.active > 1)
1708 return 0;
1709 args = state->args;
1710
1711 trace_xfs_dir2_node_to_leaf(args);
1712
1713 mp = state->mp;
1714 dp = args->dp;
1715 tp = args->trans;
1716 /*
1717 * Get the last offset in the file.
1718 */
1719 if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
1720 return error;
1721 }
1722 fo -= args->geo->fsbcount;
1723 /*
1724 * If there are freespace blocks other than the first one,
1725 * take this opportunity to remove trailing empty freespace blocks
1726 * that may have been left behind during no-space-reservation
1727 * operations.
1728 */
1729 while (fo > args->geo->freeblk) {
1730 if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
1731 return error;
1732 }
1733 if (rval)
1734 fo -= args->geo->fsbcount;
1735 else
1736 return 0;
1737 }
1738 /*
1739 * Now find the block just before the freespace block.
1740 */
1741 if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
1742 return error;
1743 }
1744 /*
1745 * If it's not the single leaf block, give up.
1746 */
1747 if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
1748 return 0;
1749 lbp = state->path.blk[0].bp;
1750 leaf = lbp->b_addr;
1751 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1752
1753 ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
1754 leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
1755
1756 /*
1757 * Read the freespace block.
1758 */
1759 error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
1760 if (error)
1761 return error;
1762 free = fbp->b_addr;
1763 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1764
1765 ASSERT(!freehdr.firstdb);
1766
1767 /*
1768 * Now see if the leafn and free data will fit in a leaf1.
1769 * If not, release the buffer and give up.
1770 */
1771 if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
1772 xfs_trans_brelse(tp, fbp);
1773 return 0;
1774 }
1775
1776 /*
1777 * If the leaf has any stale entries in it, compress them out.
1778 */
1779 if (leafhdr.stale)
1780 xfs_dir3_leaf_compact(args, &leafhdr, lbp);
1781
1782 lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
1783 xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
1784 leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
1785 ? XFS_DIR2_LEAF1_MAGIC
1786 : XFS_DIR3_LEAF1_MAGIC;
1787
1788 /*
1789 * Set up the leaf tail from the freespace block.
1790 */
1791 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
1792 ltp->bestcount = cpu_to_be32(freehdr.nvalid);
1793
1794 /*
1795 * Set up the leaf bests table.
1796 */
1797 memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
1798 freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
1799
1800 dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
1801 xfs_dir3_leaf_log_header(args, lbp);
1802 xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
1803 xfs_dir3_leaf_log_tail(args, lbp);
1804 xfs_dir3_leaf_check(dp, lbp);
1805
1806 /*
1807 * Get rid of the freespace block.
1808 */
1809 error = xfs_dir2_shrink_inode(args,
1810 xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
1811 fbp);
1812 if (error) {
1813 /*
1814 * This can't fail here because it can only happen when
1815 * punching out the middle of an extent, and this is an
1816 * isolated block.
1817 */
1818 ASSERT(error != -ENOSPC);
1819 return error;
1820 }
1821 fbp = NULL;
1822 /*
1823 * Now see if we can convert the single-leaf directory
1824 * down to a block form directory.
1825 * This routine always kills the dabuf for the leaf, so
1826 * eliminate it from the path.
1827 */
1828 error = xfs_dir2_leaf_to_block(args, lbp, NULL);
1829 state->path.blk[0].bp = NULL;
1830 return error;
1831}
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
new file mode 100644
index 000000000000..2ae6ac2c11ae
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -0,0 +1,2284 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
29#include "xfs_inode.h"
30#include "xfs_bmap.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h"
33#include "xfs_error.h"
34#include "xfs_trace.h"
35#include "xfs_trans.h"
36#include "xfs_buf_item.h"
37#include "xfs_cksum.h"
38
39/*
40 * Function declarations.
41 */
42static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
43 int index);
44static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
45 xfs_da_state_blk_t *blk1,
46 xfs_da_state_blk_t *blk2);
47static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
48 int index, xfs_da_state_blk_t *dblk,
49 int *rval);
50static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
51 xfs_da_state_blk_t *fblk);
52
/*
 * Check internal consistency of a leafn block.
 *
 * On DEBUG builds xfs_dir3_leaf_check() asserts if the block fails
 * xfs_dir3_leafn_check(); on other builds it expands to nothing.
 *
 * The macro body deliberately has no trailing semicolon after "while (0)":
 * the caller's own ';' completes the statement, so the macro behaves like a
 * single statement and can be used in an unbraced if/else.
 */
#ifdef DEBUG
#define	xfs_dir3_leaf_check(dp, bp) \
do { \
	if (!xfs_dir3_leafn_check((dp), (bp))) \
		ASSERT(0); \
} while (0)

/*
 * Return true when the buffer holds a leafn block that looks sane: the
 * magic is one of the LEAFN magics, a v3 block records its own disk address
 * (info.blkno matches bp->b_bn), and the generic leaf checks pass.
 */
static bool
xfs_dir3_leafn_check(
	struct xfs_inode	*dp,
	struct xfs_buf		*bp)
{
	struct xfs_dir2_leaf	*leaf = bp->b_addr;
	struct xfs_dir3_icleaf_hdr leafhdr;

	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);

	if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
			return false;
	} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
		return false;

	return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
}
#else
#define	xfs_dir3_leaf_check(dp, bp)
#endif
85
86static bool
87xfs_dir3_free_verify(
88 struct xfs_buf *bp)
89{
90 struct xfs_mount *mp = bp->b_target->bt_mount;
91 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
92
93 if (xfs_sb_version_hascrc(&mp->m_sb)) {
94 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
95
96 if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
97 return false;
98 if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
99 return false;
100 if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
101 return false;
102 } else {
103 if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
104 return false;
105 }
106
107 /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
108
109 return true;
110}
111
112static void
113xfs_dir3_free_read_verify(
114 struct xfs_buf *bp)
115{
116 struct xfs_mount *mp = bp->b_target->bt_mount;
117
118 if (xfs_sb_version_hascrc(&mp->m_sb) &&
119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
120 xfs_buf_ioerror(bp, -EFSBADCRC);
121 else if (!xfs_dir3_free_verify(bp))
122 xfs_buf_ioerror(bp, -EFSCORRUPTED);
123
124 if (bp->b_error)
125 xfs_verifier_error(bp);
126}
127
128static void
129xfs_dir3_free_write_verify(
130 struct xfs_buf *bp)
131{
132 struct xfs_mount *mp = bp->b_target->bt_mount;
133 struct xfs_buf_log_item *bip = bp->b_fspriv;
134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
135
136 if (!xfs_dir3_free_verify(bp)) {
137 xfs_buf_ioerror(bp, -EFSCORRUPTED);
138 xfs_verifier_error(bp);
139 return;
140 }
141
142 if (!xfs_sb_version_hascrc(&mp->m_sb))
143 return;
144
145 if (bip)
146 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
147
148 xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
149}
150
151const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
152 .verify_read = xfs_dir3_free_read_verify,
153 .verify_write = xfs_dir3_free_write_verify,
154};
155
156
157static int
158__xfs_dir3_free_read(
159 struct xfs_trans *tp,
160 struct xfs_inode *dp,
161 xfs_dablk_t fbno,
162 xfs_daddr_t mappedbno,
163 struct xfs_buf **bpp)
164{
165 int err;
166
167 err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
168 XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
169
170 /* try read returns without an error or *bpp if it lands in a hole */
171 if (!err && tp && *bpp)
172 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
173 return err;
174}
175
176int
177xfs_dir2_free_read(
178 struct xfs_trans *tp,
179 struct xfs_inode *dp,
180 xfs_dablk_t fbno,
181 struct xfs_buf **bpp)
182{
183 return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
184}
185
186static int
187xfs_dir2_free_try_read(
188 struct xfs_trans *tp,
189 struct xfs_inode *dp,
190 xfs_dablk_t fbno,
191 struct xfs_buf **bpp)
192{
193 return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
194}
195
196static int
197xfs_dir3_free_get_buf(
198 xfs_da_args_t *args,
199 xfs_dir2_db_t fbno,
200 struct xfs_buf **bpp)
201{
202 struct xfs_trans *tp = args->trans;
203 struct xfs_inode *dp = args->dp;
204 struct xfs_mount *mp = dp->i_mount;
205 struct xfs_buf *bp;
206 int error;
207 struct xfs_dir3_icfree_hdr hdr;
208
209 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
210 -1, &bp, XFS_DATA_FORK);
211 if (error)
212 return error;
213
214 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
215 bp->b_ops = &xfs_dir3_free_buf_ops;
216
217 /*
218 * Initialize the new block to be empty, and remember
219 * its first slot as our empty slot.
220 */
221 memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
222 memset(&hdr, 0, sizeof(hdr));
223
224 if (xfs_sb_version_hascrc(&mp->m_sb)) {
225 struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
226
227 hdr.magic = XFS_DIR3_FREE_MAGIC;
228
229 hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
230 hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
231 uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
232 } else
233 hdr.magic = XFS_DIR2_FREE_MAGIC;
234 dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
235 *bpp = bp;
236 return 0;
237}
238
239/*
240 * Log entries from a freespace block.
241 */
242STATIC void
243xfs_dir2_free_log_bests(
244 struct xfs_da_args *args,
245 struct xfs_buf *bp,
246 int first, /* first entry to log */
247 int last) /* last entry to log */
248{
249 xfs_dir2_free_t *free; /* freespace structure */
250 __be16 *bests;
251
252 free = bp->b_addr;
253 bests = args->dp->d_ops->free_bests_p(free);
254 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
255 free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
256 xfs_trans_log_buf(args->trans, bp,
257 (uint)((char *)&bests[first] - (char *)free),
258 (uint)((char *)&bests[last] - (char *)free +
259 sizeof(bests[0]) - 1));
260}
261
262/*
263 * Log header from a freespace block.
264 */
265static void
266xfs_dir2_free_log_header(
267 struct xfs_da_args *args,
268 struct xfs_buf *bp)
269{
270#ifdef DEBUG
271 xfs_dir2_free_t *free; /* freespace structure */
272
273 free = bp->b_addr;
274 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
275 free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
276#endif
277 xfs_trans_log_buf(args->trans, bp, 0,
278 args->dp->d_ops->free_hdr_size - 1);
279}
280
281/*
282 * Convert a leaf-format directory to a node-format directory.
283 * We need to change the magic number of the leaf block, and copy
284 * the freespace table out of the leaf block into its own block.
285 */
286int /* error */
287xfs_dir2_leaf_to_node(
288 xfs_da_args_t *args, /* operation arguments */
289 struct xfs_buf *lbp) /* leaf buffer */
290{
291 xfs_inode_t *dp; /* incore directory inode */
292 int error; /* error return value */
293 struct xfs_buf *fbp; /* freespace buffer */
294 xfs_dir2_db_t fdb; /* freespace block number */
295 xfs_dir2_free_t *free; /* freespace structure */
296 __be16 *from; /* pointer to freespace entry */
297 int i; /* leaf freespace index */
298 xfs_dir2_leaf_t *leaf; /* leaf structure */
299 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
300 xfs_mount_t *mp; /* filesystem mount point */
301 int n; /* count of live freespc ents */
302 xfs_dir2_data_off_t off; /* freespace entry value */
303 __be16 *to; /* pointer to freespace entry */
304 xfs_trans_t *tp; /* transaction pointer */
305 struct xfs_dir3_icfree_hdr freehdr;
306
307 trace_xfs_dir2_leaf_to_node(args);
308
309 dp = args->dp;
310 mp = dp->i_mount;
311 tp = args->trans;
312 /*
313 * Add a freespace block to the directory.
314 */
315 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
316 return error;
317 }
318 ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
319 /*
320 * Get the buffer for the new freespace block.
321 */
322 error = xfs_dir3_free_get_buf(args, fdb, &fbp);
323 if (error)
324 return error;
325
326 free = fbp->b_addr;
327 dp->d_ops->free_hdr_from_disk(&freehdr, free);
328 leaf = lbp->b_addr;
329 ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
330 ASSERT(be32_to_cpu(ltp->bestcount) <=
331 (uint)dp->i_d.di_size / args->geo->blksize);
332
333 /*
334 * Copy freespace entries from the leaf block to the new block.
335 * Count active entries.
336 */
337 from = xfs_dir2_leaf_bests_p(ltp);
338 to = dp->d_ops->free_bests_p(free);
339 for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
340 if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
341 n++;
342 *to = cpu_to_be16(off);
343 }
344
345 /*
346 * Now initialize the freespace block header.
347 */
348 freehdr.nused = n;
349 freehdr.nvalid = be32_to_cpu(ltp->bestcount);
350
351 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
352 xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
353 xfs_dir2_free_log_header(args, fbp);
354
355 /*
356 * Converting the leaf to a leafnode is just a matter of changing the
357 * magic number and the ops. Do the change directly to the buffer as
358 * it's less work (and less code) than decoding the header to host
359 * format and back again.
360 */
361 if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
362 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
363 else
364 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
365 lbp->b_ops = &xfs_dir3_leafn_buf_ops;
366 xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
367 xfs_dir3_leaf_log_header(args, lbp);
368 xfs_dir3_leaf_check(dp, lbp);
369 return 0;
370}
371
372/*
373 * Add a leaf entry to a leaf block in a node-form directory.
374 * The other work necessary is done from the caller.
375 */
376static int /* error */
377xfs_dir2_leafn_add(
378 struct xfs_buf *bp, /* leaf buffer */
379 xfs_da_args_t *args, /* operation arguments */
380 int index) /* insertion pt for new entry */
381{
382 int compact; /* compacting stale leaves */
383 xfs_inode_t *dp; /* incore directory inode */
384 int highstale; /* next stale entry */
385 xfs_dir2_leaf_t *leaf; /* leaf structure */
386 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
387 int lfloghigh; /* high leaf entry logging */
388 int lfloglow; /* low leaf entry logging */
389 int lowstale; /* previous stale entry */
390 xfs_mount_t *mp; /* filesystem mount point */
391 xfs_trans_t *tp; /* transaction pointer */
392 struct xfs_dir3_icleaf_hdr leafhdr;
393 struct xfs_dir2_leaf_entry *ents;
394
395 trace_xfs_dir2_leafn_add(args, index);
396
397 dp = args->dp;
398 mp = dp->i_mount;
399 tp = args->trans;
400 leaf = bp->b_addr;
401 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
402 ents = dp->d_ops->leaf_ents_p(leaf);
403
404 /*
405 * Quick check just to make sure we are not going to index
406 * into other peoples memory
407 */
408 if (index < 0)
409 return -EFSCORRUPTED;
410
411 /*
412 * If there are already the maximum number of leaf entries in
413 * the block, if there are no stale entries it won't fit.
414 * Caller will do a split. If there are stale entries we'll do
415 * a compact.
416 */
417
418 if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
419 if (!leafhdr.stale)
420 return -ENOSPC;
421 compact = leafhdr.stale > 1;
422 } else
423 compact = 0;
424 ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
425 ASSERT(index == leafhdr.count ||
426 be32_to_cpu(ents[index].hashval) >= args->hashval);
427
428 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
429 return 0;
430
431 /*
432 * Compact out all but one stale leaf entry. Leaves behind
433 * the entry closest to index.
434 */
435 if (compact)
436 xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
437 &highstale, &lfloglow, &lfloghigh);
438 else if (leafhdr.stale) {
439 /*
440 * Set impossible logging indices for this case.
441 */
442 lfloglow = leafhdr.count;
443 lfloghigh = -1;
444 }
445
446 /*
447 * Insert the new entry, log everything.
448 */
449 lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
450 highstale, &lfloglow, &lfloghigh);
451
452 lep->hashval = cpu_to_be32(args->hashval);
453 lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
454 args->blkno, args->index));
455
456 dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
457 xfs_dir3_leaf_log_header(args, bp);
458 xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
459 xfs_dir3_leaf_check(dp, bp);
460 return 0;
461}
462
#ifdef DEBUG
/*
 * Debug-only sanity check of a freespace block header against data block
 * db: firstdb must be a multiple of the per-block bests capacity, and db
 * must fall inside [firstdb, firstdb + nvalid).
 */
static void
xfs_dir2_free_hdr_check(
	struct xfs_inode *dp,
	struct xfs_buf	*bp,
	xfs_dir2_db_t	db)
{
	struct xfs_dir3_icfree_hdr hdr;

	dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);

	ASSERT((hdr.firstdb %
		dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
	ASSERT(db >= hdr.firstdb);
	ASSERT(hdr.firstdb + hdr.nvalid > db);
}
#else
#define xfs_dir2_free_hdr_check(dp, bp, db)
#endif	/* DEBUG */
482
483/*
484 * Return the last hash value in the leaf.
485 * Stale entries are ok.
486 */
487xfs_dahash_t /* hash value */
488xfs_dir2_leafn_lasthash(
489 struct xfs_inode *dp,
490 struct xfs_buf *bp, /* leaf buffer */
491 int *count) /* count of entries in leaf */
492{
493 struct xfs_dir2_leaf *leaf = bp->b_addr;
494 struct xfs_dir2_leaf_entry *ents;
495 struct xfs_dir3_icleaf_hdr leafhdr;
496
497 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
498
499 ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
500 leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
501
502 if (count)
503 *count = leafhdr.count;
504 if (!leafhdr.count)
505 return 0;
506
507 ents = dp->d_ops->leaf_ents_p(leaf);
508 return be32_to_cpu(ents[leafhdr.count - 1].hashval);
509}
510
511/*
512 * Look up a leaf entry for space to add a name in a node-format leaf block.
513 * The extrablk in state is a freespace block.
514 */
515STATIC int
516xfs_dir2_leafn_lookup_for_addname(
517 struct xfs_buf *bp, /* leaf buffer */
518 xfs_da_args_t *args, /* operation arguments */
519 int *indexp, /* out: leaf entry index */
520 xfs_da_state_t *state) /* state to fill in */
521{
522 struct xfs_buf *curbp = NULL; /* current data/free buffer */
523 xfs_dir2_db_t curdb = -1; /* current data block number */
524 xfs_dir2_db_t curfdb = -1; /* current free block number */
525 xfs_inode_t *dp; /* incore directory inode */
526 int error; /* error return value */
527 int fi; /* free entry index */
528 xfs_dir2_free_t *free = NULL; /* free block structure */
529 int index; /* leaf entry index */
530 xfs_dir2_leaf_t *leaf; /* leaf structure */
531 int length; /* length of new data entry */
532 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
533 xfs_mount_t *mp; /* filesystem mount point */
534 xfs_dir2_db_t newdb; /* new data block number */
535 xfs_dir2_db_t newfdb; /* new free block number */
536 xfs_trans_t *tp; /* transaction pointer */
537 struct xfs_dir2_leaf_entry *ents;
538 struct xfs_dir3_icleaf_hdr leafhdr;
539
540 dp = args->dp;
541 tp = args->trans;
542 mp = dp->i_mount;
543 leaf = bp->b_addr;
544 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
545 ents = dp->d_ops->leaf_ents_p(leaf);
546
547 xfs_dir3_leaf_check(dp, bp);
548 ASSERT(leafhdr.count > 0);
549
550 /*
551 * Look up the hash value in the leaf entries.
552 */
553 index = xfs_dir2_leaf_search_hash(args, bp);
554 /*
555 * Do we have a buffer coming in?
556 */
557 if (state->extravalid) {
558 /* If so, it's a free block buffer, get the block number. */
559 curbp = state->extrablk.bp;
560 curfdb = state->extrablk.blkno;
561 free = curbp->b_addr;
562 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
563 free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
564 }
565 length = dp->d_ops->data_entsize(args->namelen);
566 /*
567 * Loop over leaf entries with the right hash value.
568 */
569 for (lep = &ents[index];
570 index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
571 lep++, index++) {
572 /*
573 * Skip stale leaf entries.
574 */
575 if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
576 continue;
577 /*
578 * Pull the data block number from the entry.
579 */
580 newdb = xfs_dir2_dataptr_to_db(args->geo,
581 be32_to_cpu(lep->address));
582 /*
583 * For addname, we're looking for a place to put the new entry.
584 * We want to use a data block with an entry of equal
585 * hash value to ours if there is one with room.
586 *
587 * If this block isn't the data block we already have
588 * in hand, take a look at it.
589 */
590 if (newdb != curdb) {
591 __be16 *bests;
592
593 curdb = newdb;
594 /*
595 * Convert the data block to the free block
596 * holding its freespace information.
597 */
598 newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
599 /*
600 * If it's not the one we have in hand, read it in.
601 */
602 if (newfdb != curfdb) {
603 /*
604 * If we had one before, drop it.
605 */
606 if (curbp)
607 xfs_trans_brelse(tp, curbp);
608
609 error = xfs_dir2_free_read(tp, dp,
610 xfs_dir2_db_to_da(args->geo,
611 newfdb),
612 &curbp);
613 if (error)
614 return error;
615 free = curbp->b_addr;
616
617 xfs_dir2_free_hdr_check(dp, curbp, curdb);
618 }
619 /*
620 * Get the index for our entry.
621 */
622 fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
623 /*
624 * If it has room, return it.
625 */
626 bests = dp->d_ops->free_bests_p(free);
627 if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
628 XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
629 XFS_ERRLEVEL_LOW, mp);
630 if (curfdb != newfdb)
631 xfs_trans_brelse(tp, curbp);
632 return -EFSCORRUPTED;
633 }
634 curfdb = newfdb;
635 if (be16_to_cpu(bests[fi]) >= length)
636 goto out;
637 }
638 }
639 /* Didn't find any space */
640 fi = -1;
641out:
642 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
643 if (curbp) {
644 /* Giving back a free block. */
645 state->extravalid = 1;
646 state->extrablk.bp = curbp;
647 state->extrablk.index = fi;
648 state->extrablk.blkno = curfdb;
649
650 /*
651 * Important: this magic number is not in the buffer - it's for
652 * buffer type information and therefore only the free/data type
653 * matters here, not whether CRCs are enabled or not.
654 */
655 state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
656 } else {
657 state->extravalid = 0;
658 }
659 /*
660 * Return the index, that will be the insertion point.
661 */
662 *indexp = index;
663 return -ENOENT;
664}
665
666/*
667 * Look up a leaf entry in a node-format leaf block.
668 * The extrablk in state a data block.
669 */
670STATIC int
671xfs_dir2_leafn_lookup_for_entry(
672 struct xfs_buf *bp, /* leaf buffer */
673 xfs_da_args_t *args, /* operation arguments */
674 int *indexp, /* out: leaf entry index */
675 xfs_da_state_t *state) /* state to fill in */
676{
677 struct xfs_buf *curbp = NULL; /* current data/free buffer */
678 xfs_dir2_db_t curdb = -1; /* current data block number */
679 xfs_dir2_data_entry_t *dep; /* data block entry */
680 xfs_inode_t *dp; /* incore directory inode */
681 int error; /* error return value */
682 int index; /* leaf entry index */
683 xfs_dir2_leaf_t *leaf; /* leaf structure */
684 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
685 xfs_mount_t *mp; /* filesystem mount point */
686 xfs_dir2_db_t newdb; /* new data block number */
687 xfs_trans_t *tp; /* transaction pointer */
688 enum xfs_dacmp cmp; /* comparison result */
689 struct xfs_dir2_leaf_entry *ents;
690 struct xfs_dir3_icleaf_hdr leafhdr;
691
692 dp = args->dp;
693 tp = args->trans;
694 mp = dp->i_mount;
695 leaf = bp->b_addr;
696 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
697 ents = dp->d_ops->leaf_ents_p(leaf);
698
699 xfs_dir3_leaf_check(dp, bp);
700 ASSERT(leafhdr.count > 0);
701
702 /*
703 * Look up the hash value in the leaf entries.
704 */
705 index = xfs_dir2_leaf_search_hash(args, bp);
706 /*
707 * Do we have a buffer coming in?
708 */
709 if (state->extravalid) {
710 curbp = state->extrablk.bp;
711 curdb = state->extrablk.blkno;
712 }
713 /*
714 * Loop over leaf entries with the right hash value.
715 */
716 for (lep = &ents[index];
717 index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
718 lep++, index++) {
719 /*
720 * Skip stale leaf entries.
721 */
722 if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
723 continue;
724 /*
725 * Pull the data block number from the entry.
726 */
727 newdb = xfs_dir2_dataptr_to_db(args->geo,
728 be32_to_cpu(lep->address));
729 /*
730 * Not adding a new entry, so we really want to find
731 * the name given to us.
732 *
733 * If it's a different data block, go get it.
734 */
735 if (newdb != curdb) {
736 /*
737 * If we had a block before that we aren't saving
738 * for a CI name, drop it
739 */
740 if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
741 curdb != state->extrablk.blkno))
742 xfs_trans_brelse(tp, curbp);
743 /*
744 * If needing the block that is saved with a CI match,
745 * use it otherwise read in the new data block.
746 */
747 if (args->cmpresult != XFS_CMP_DIFFERENT &&
748 newdb == state->extrablk.blkno) {
749 ASSERT(state->extravalid);
750 curbp = state->extrablk.bp;
751 } else {
752 error = xfs_dir3_data_read(tp, dp,
753 xfs_dir2_db_to_da(args->geo,
754 newdb),
755 -1, &curbp);
756 if (error)
757 return error;
758 }
759 xfs_dir3_data_check(dp, curbp);
760 curdb = newdb;
761 }
762 /*
763 * Point to the data entry.
764 */
765 dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
766 xfs_dir2_dataptr_to_off(args->geo,
767 be32_to_cpu(lep->address)));
768 /*
769 * Compare the entry and if it's an exact match, return
770 * EEXIST immediately. If it's the first case-insensitive
771 * match, store the block & inode number and continue looking.
772 */
773 cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
774 if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
775 /* If there is a CI match block, drop it */
776 if (args->cmpresult != XFS_CMP_DIFFERENT &&
777 curdb != state->extrablk.blkno)
778 xfs_trans_brelse(tp, state->extrablk.bp);
779 args->cmpresult = cmp;
780 args->inumber = be64_to_cpu(dep->inumber);
781 args->filetype = dp->d_ops->data_get_ftype(dep);
782 *indexp = index;
783 state->extravalid = 1;
784 state->extrablk.bp = curbp;
785 state->extrablk.blkno = curdb;
786 state->extrablk.index = (int)((char *)dep -
787 (char *)curbp->b_addr);
788 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
789 curbp->b_ops = &xfs_dir3_data_buf_ops;
790 xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
791 if (cmp == XFS_CMP_EXACT)
792 return -EEXIST;
793 }
794 }
795 ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
796 if (curbp) {
797 if (args->cmpresult == XFS_CMP_DIFFERENT) {
798 /* Giving back last used data block. */
799 state->extravalid = 1;
800 state->extrablk.bp = curbp;
801 state->extrablk.index = -1;
802 state->extrablk.blkno = curdb;
803 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
804 curbp->b_ops = &xfs_dir3_data_buf_ops;
805 xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
806 } else {
807 /* If the curbp is not the CI match block, drop it */
808 if (state->extrablk.bp != curbp)
809 xfs_trans_brelse(tp, curbp);
810 }
811 } else {
812 state->extravalid = 0;
813 }
814 *indexp = index;
815 return -ENOENT;
816}
817
818/*
819 * Look up a leaf entry in a node-format leaf block.
820 * If this is an addname then the extrablk in state is a freespace block,
821 * otherwise it's a data block.
822 */
823int
824xfs_dir2_leafn_lookup_int(
825 struct xfs_buf *bp, /* leaf buffer */
826 xfs_da_args_t *args, /* operation arguments */
827 int *indexp, /* out: leaf entry index */
828 xfs_da_state_t *state) /* state to fill in */
829{
830 if (args->op_flags & XFS_DA_OP_ADDNAME)
831 return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
832 state);
833 return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
834}
835
836/*
837 * Move count leaf entries from source to destination leaf.
838 * Log entries and headers. Stale entries are preserved.
839 */
840static void
841xfs_dir3_leafn_moveents(
842 xfs_da_args_t *args, /* operation arguments */
843 struct xfs_buf *bp_s, /* source */
844 struct xfs_dir3_icleaf_hdr *shdr,
845 struct xfs_dir2_leaf_entry *sents,
846 int start_s,/* source leaf index */
847 struct xfs_buf *bp_d, /* destination */
848 struct xfs_dir3_icleaf_hdr *dhdr,
849 struct xfs_dir2_leaf_entry *dents,
850 int start_d,/* destination leaf index */
851 int count) /* count of leaves to copy */
852{
853 int stale; /* count stale leaves copied */
854
855 trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
856
857 /*
858 * Silently return if nothing to do.
859 */
860 if (count == 0)
861 return;
862
863 /*
864 * If the destination index is not the end of the current
865 * destination leaf entries, open up a hole in the destination
866 * to hold the new entries.
867 */
868 if (start_d < dhdr->count) {
869 memmove(&dents[start_d + count], &dents[start_d],
870 (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
871 xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
872 count + dhdr->count - 1);
873 }
874 /*
875 * If the source has stale leaves, count the ones in the copy range
876 * so we can update the header correctly.
877 */
878 if (shdr->stale) {
879 int i; /* temp leaf index */
880
881 for (i = start_s, stale = 0; i < start_s + count; i++) {
882 if (sents[i].address ==
883 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
884 stale++;
885 }
886 } else
887 stale = 0;
888 /*
889 * Copy the leaf entries from source to destination.
890 */
891 memcpy(&dents[start_d], &sents[start_s],
892 count * sizeof(xfs_dir2_leaf_entry_t));
893 xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
894
895 /*
896 * If there are source entries after the ones we copied,
897 * delete the ones we copied by sliding the next ones down.
898 */
899 if (start_s + count < shdr->count) {
900 memmove(&sents[start_s], &sents[start_s + count],
901 count * sizeof(xfs_dir2_leaf_entry_t));
902 xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
903 }
904
905 /*
906 * Update the headers and log them.
907 */
908 shdr->count -= count;
909 shdr->stale -= stale;
910 dhdr->count += count;
911 dhdr->stale += stale;
912}
913
914/*
915 * Determine the sort order of two leaf blocks.
916 * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
917 */
918int /* sort order */
919xfs_dir2_leafn_order(
920 struct xfs_inode *dp,
921 struct xfs_buf *leaf1_bp, /* leaf1 buffer */
922 struct xfs_buf *leaf2_bp) /* leaf2 buffer */
923{
924 struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr;
925 struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr;
926 struct xfs_dir2_leaf_entry *ents1;
927 struct xfs_dir2_leaf_entry *ents2;
928 struct xfs_dir3_icleaf_hdr hdr1;
929 struct xfs_dir3_icleaf_hdr hdr2;
930
931 dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
932 dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
933 ents1 = dp->d_ops->leaf_ents_p(leaf1);
934 ents2 = dp->d_ops->leaf_ents_p(leaf2);
935
936 if (hdr1.count > 0 && hdr2.count > 0 &&
937 (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
938 be32_to_cpu(ents2[hdr2.count - 1].hashval) <
939 be32_to_cpu(ents1[hdr1.count - 1].hashval)))
940 return 1;
941 return 0;
942}
943
944/*
945 * Rebalance leaf entries between two leaf blocks.
946 * This is actually only called when the second block is new,
947 * though the code deals with the general case.
948 * A new entry will be inserted in one of the blocks, and that
949 * entry is taken into account when balancing.
950 */
951static void
952xfs_dir2_leafn_rebalance(
953 xfs_da_state_t *state, /* btree cursor */
954 xfs_da_state_blk_t *blk1, /* first btree block */
955 xfs_da_state_blk_t *blk2) /* second btree block */
956{
957 xfs_da_args_t *args; /* operation arguments */
958 int count; /* count (& direction) leaves */
959 int isleft; /* new goes in left leaf */
960 xfs_dir2_leaf_t *leaf1; /* first leaf structure */
961 xfs_dir2_leaf_t *leaf2; /* second leaf structure */
962 int mid; /* midpoint leaf index */
963#if defined(DEBUG) || defined(XFS_WARN)
964 int oldstale; /* old count of stale leaves */
965#endif
966 int oldsum; /* old total leaf count */
967 int swap; /* swapped leaf blocks */
968 struct xfs_dir2_leaf_entry *ents1;
969 struct xfs_dir2_leaf_entry *ents2;
970 struct xfs_dir3_icleaf_hdr hdr1;
971 struct xfs_dir3_icleaf_hdr hdr2;
972 struct xfs_inode *dp = state->args->dp;
973
974 args = state->args;
975 /*
976 * If the block order is wrong, swap the arguments.
977 */
978 if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
979 xfs_da_state_blk_t *tmp; /* temp for block swap */
980
981 tmp = blk1;
982 blk1 = blk2;
983 blk2 = tmp;
984 }
985 leaf1 = blk1->bp->b_addr;
986 leaf2 = blk2->bp->b_addr;
987 dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
988 dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
989 ents1 = dp->d_ops->leaf_ents_p(leaf1);
990 ents2 = dp->d_ops->leaf_ents_p(leaf2);
991
992 oldsum = hdr1.count + hdr2.count;
993#if defined(DEBUG) || defined(XFS_WARN)
994 oldstale = hdr1.stale + hdr2.stale;
995#endif
996 mid = oldsum >> 1;
997
998 /*
999 * If the old leaf count was odd then the new one will be even,
1000 * so we need to divide the new count evenly.
1001 */
1002 if (oldsum & 1) {
1003 xfs_dahash_t midhash; /* middle entry hash value */
1004
1005 if (mid >= hdr1.count)
1006 midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
1007 else
1008 midhash = be32_to_cpu(ents1[mid].hashval);
1009 isleft = args->hashval <= midhash;
1010 }
1011 /*
1012 * If the old count is even then the new count is odd, so there's
1013 * no preferred side for the new entry.
1014 * Pick the left one.
1015 */
1016 else
1017 isleft = 1;
1018 /*
1019 * Calculate moved entry count. Positive means left-to-right,
1020 * negative means right-to-left. Then move the entries.
1021 */
1022 count = hdr1.count - mid + (isleft == 0);
1023 if (count > 0)
1024 xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
1025 hdr1.count - count, blk2->bp,
1026 &hdr2, ents2, 0, count);
1027 else if (count < 0)
1028 xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
1029 blk1->bp, &hdr1, ents1,
1030 hdr1.count, count);
1031
1032 ASSERT(hdr1.count + hdr2.count == oldsum);
1033 ASSERT(hdr1.stale + hdr2.stale == oldstale);
1034
1035 /* log the changes made when moving the entries */
1036 dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
1037 dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
1038 xfs_dir3_leaf_log_header(args, blk1->bp);
1039 xfs_dir3_leaf_log_header(args, blk2->bp);
1040
1041 xfs_dir3_leaf_check(dp, blk1->bp);
1042 xfs_dir3_leaf_check(dp, blk2->bp);
1043
1044 /*
1045 * Mark whether we're inserting into the old or new leaf.
1046 */
1047 if (hdr1.count < hdr2.count)
1048 state->inleaf = swap;
1049 else if (hdr1.count > hdr2.count)
1050 state->inleaf = !swap;
1051 else
1052 state->inleaf = swap ^ (blk1->index <= hdr1.count);
1053 /*
1054 * Adjust the expected index for insertion.
1055 */
1056 if (!state->inleaf)
1057 blk2->index = blk1->index - hdr1.count;
1058
1059 /*
1060 * Finally sanity check just to make sure we are not returning a
1061 * negative index
1062 */
1063 if (blk2->index < 0) {
1064 state->inleaf = 1;
1065 blk2->index = 0;
1066 xfs_alert(dp->i_mount,
1067 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
1068 __func__, blk1->index);
1069 }
1070}
1071
1072static int
1073xfs_dir3_data_block_free(
1074 xfs_da_args_t *args,
1075 struct xfs_dir2_data_hdr *hdr,
1076 struct xfs_dir2_free *free,
1077 xfs_dir2_db_t fdb,
1078 int findex,
1079 struct xfs_buf *fbp,
1080 int longest)
1081{
1082 int logfree = 0;
1083 __be16 *bests;
1084 struct xfs_dir3_icfree_hdr freehdr;
1085 struct xfs_inode *dp = args->dp;
1086
1087 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1088 bests = dp->d_ops->free_bests_p(free);
1089 if (hdr) {
1090 /*
1091 * Data block is not empty, just set the free entry to the new
1092 * value.
1093 */
1094 bests[findex] = cpu_to_be16(longest);
1095 xfs_dir2_free_log_bests(args, fbp, findex, findex);
1096 return 0;
1097 }
1098
1099 /* One less used entry in the free table. */
1100 freehdr.nused--;
1101
1102 /*
1103 * If this was the last entry in the table, we can trim the table size
1104 * back. There might be other entries at the end referring to
1105 * non-existent data blocks, get those too.
1106 */
1107 if (findex == freehdr.nvalid - 1) {
1108 int i; /* free entry index */
1109
1110 for (i = findex - 1; i >= 0; i--) {
1111 if (bests[i] != cpu_to_be16(NULLDATAOFF))
1112 break;
1113 }
1114 freehdr.nvalid = i + 1;
1115 logfree = 0;
1116 } else {
1117 /* Not the last entry, just punch it out. */
1118 bests[findex] = cpu_to_be16(NULLDATAOFF);
1119 logfree = 1;
1120 }
1121
1122 dp->d_ops->free_hdr_to_disk(free, &freehdr);
1123 xfs_dir2_free_log_header(args, fbp);
1124
1125 /*
1126 * If there are no useful entries left in the block, get rid of the
1127 * block if we can.
1128 */
1129 if (!freehdr.nused) {
1130 int error;
1131
1132 error = xfs_dir2_shrink_inode(args, fdb, fbp);
1133 if (error == 0) {
1134 fbp = NULL;
1135 logfree = 0;
1136 } else if (error != -ENOSPC || args->total != 0)
1137 return error;
1138 /*
1139 * It's possible to get ENOSPC if there is no
1140 * space reservation. In this case some one
1141 * else will eventually get rid of this block.
1142 */
1143 }
1144
1145 /* Log the free entry that changed, unless we got rid of it. */
1146 if (logfree)
1147 xfs_dir2_free_log_bests(args, fbp, findex, findex);
1148 return 0;
1149}
1150
1151/*
1152 * Remove an entry from a node directory.
1153 * This removes the leaf entry and the data entry,
1154 * and updates the free block if necessary.
1155 */
1156static int /* error */
1157xfs_dir2_leafn_remove(
1158 xfs_da_args_t *args, /* operation arguments */
1159 struct xfs_buf *bp, /* leaf buffer */
1160 int index, /* leaf entry index */
1161 xfs_da_state_blk_t *dblk, /* data block */
1162 int *rval) /* resulting block needs join */
1163{
1164 xfs_dir2_data_hdr_t *hdr; /* data block header */
1165 xfs_dir2_db_t db; /* data block number */
1166 struct xfs_buf *dbp; /* data block buffer */
1167 xfs_dir2_data_entry_t *dep; /* data block entry */
1168 xfs_inode_t *dp; /* incore directory inode */
1169 xfs_dir2_leaf_t *leaf; /* leaf structure */
1170 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1171 int longest; /* longest data free entry */
1172 int off; /* data block entry offset */
1173 xfs_mount_t *mp; /* filesystem mount point */
1174 int needlog; /* need to log data header */
1175 int needscan; /* need to rescan data frees */
1176 xfs_trans_t *tp; /* transaction pointer */
1177 struct xfs_dir2_data_free *bf; /* bestfree table */
1178 struct xfs_dir3_icleaf_hdr leafhdr;
1179 struct xfs_dir2_leaf_entry *ents;
1180
1181 trace_xfs_dir2_leafn_remove(args, index);
1182
1183 dp = args->dp;
1184 tp = args->trans;
1185 mp = dp->i_mount;
1186 leaf = bp->b_addr;
1187 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1188 ents = dp->d_ops->leaf_ents_p(leaf);
1189
1190 /*
1191 * Point to the entry we're removing.
1192 */
1193 lep = &ents[index];
1194
1195 /*
1196 * Extract the data block and offset from the entry.
1197 */
1198 db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
1199 ASSERT(dblk->blkno == db);
1200 off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
1201 ASSERT(dblk->index == off);
1202
1203 /*
1204 * Kill the leaf entry by marking it stale.
1205 * Log the leaf block changes.
1206 */
1207 leafhdr.stale++;
1208 dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
1209 xfs_dir3_leaf_log_header(args, bp);
1210
1211 lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
1212 xfs_dir3_leaf_log_ents(args, bp, index, index);
1213
1214 /*
1215 * Make the data entry free. Keep track of the longest freespace
1216 * in the data block in case it changes.
1217 */
1218 dbp = dblk->bp;
1219 hdr = dbp->b_addr;
1220 dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
1221 bf = dp->d_ops->data_bestfree_p(hdr);
1222 longest = be16_to_cpu(bf[0].length);
1223 needlog = needscan = 0;
1224 xfs_dir2_data_make_free(args, dbp, off,
1225 dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
1226 /*
1227 * Rescan the data block freespaces for bestfree.
1228 * Log the data block header if needed.
1229 */
1230 if (needscan)
1231 xfs_dir2_data_freescan(dp, hdr, &needlog);
1232 if (needlog)
1233 xfs_dir2_data_log_header(args, dbp);
1234 xfs_dir3_data_check(dp, dbp);
1235 /*
1236 * If the longest data block freespace changes, need to update
1237 * the corresponding freeblock entry.
1238 */
1239 if (longest < be16_to_cpu(bf[0].length)) {
1240 int error; /* error return value */
1241 struct xfs_buf *fbp; /* freeblock buffer */
1242 xfs_dir2_db_t fdb; /* freeblock block number */
1243 int findex; /* index in freeblock entries */
1244 xfs_dir2_free_t *free; /* freeblock structure */
1245
1246 /*
1247 * Convert the data block number to a free block,
1248 * read in the free block.
1249 */
1250 fdb = dp->d_ops->db_to_fdb(args->geo, db);
1251 error = xfs_dir2_free_read(tp, dp,
1252 xfs_dir2_db_to_da(args->geo, fdb),
1253 &fbp);
1254 if (error)
1255 return error;
1256 free = fbp->b_addr;
1257#ifdef DEBUG
1258 {
1259 struct xfs_dir3_icfree_hdr freehdr;
1260 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1261 ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
1262 (fdb - xfs_dir2_byte_to_db(args->geo,
1263 XFS_DIR2_FREE_OFFSET)));
1264 }
1265#endif
1266 /*
1267 * Calculate which entry we need to fix.
1268 */
1269 findex = dp->d_ops->db_to_fdindex(args->geo, db);
1270 longest = be16_to_cpu(bf[0].length);
1271 /*
1272 * If the data block is now empty we can get rid of it
1273 * (usually).
1274 */
1275 if (longest == args->geo->blksize -
1276 dp->d_ops->data_entry_offset) {
1277 /*
1278 * Try to punch out the data block.
1279 */
1280 error = xfs_dir2_shrink_inode(args, db, dbp);
1281 if (error == 0) {
1282 dblk->bp = NULL;
1283 hdr = NULL;
1284 }
1285 /*
1286 * We can get ENOSPC if there's no space reservation.
1287 * In this case just drop the buffer and some one else
1288 * will eventually get rid of the empty block.
1289 */
1290 else if (!(error == -ENOSPC && args->total == 0))
1291 return error;
1292 }
1293 /*
1294 * If we got rid of the data block, we can eliminate that entry
1295 * in the free block.
1296 */
1297 error = xfs_dir3_data_block_free(args, hdr, free,
1298 fdb, findex, fbp, longest);
1299 if (error)
1300 return error;
1301 }
1302
1303 xfs_dir3_leaf_check(dp, bp);
1304 /*
1305 * Return indication of whether this leaf block is empty enough
1306 * to justify trying to join it with a neighbor.
1307 */
1308 *rval = (dp->d_ops->leaf_hdr_size +
1309 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
1310 args->geo->magicpct;
1311 return 0;
1312}
1313
1314/*
1315 * Split the leaf entries in the old block into old and new blocks.
1316 */
1317int /* error */
1318xfs_dir2_leafn_split(
1319 xfs_da_state_t *state, /* btree cursor */
1320 xfs_da_state_blk_t *oldblk, /* original block */
1321 xfs_da_state_blk_t *newblk) /* newly created block */
1322{
1323 xfs_da_args_t *args; /* operation arguments */
1324 xfs_dablk_t blkno; /* new leaf block number */
1325 int error; /* error return value */
1326 xfs_mount_t *mp; /* filesystem mount point */
1327 struct xfs_inode *dp;
1328
1329 /*
1330 * Allocate space for a new leaf node.
1331 */
1332 args = state->args;
1333 dp = args->dp;
1334 mp = dp->i_mount;
1335 ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
1336 error = xfs_da_grow_inode(args, &blkno);
1337 if (error) {
1338 return error;
1339 }
1340 /*
1341 * Initialize the new leaf block.
1342 */
1343 error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
1344 &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
1345 if (error)
1346 return error;
1347
1348 newblk->blkno = blkno;
1349 newblk->magic = XFS_DIR2_LEAFN_MAGIC;
1350 /*
1351 * Rebalance the entries across the two leaves, link the new
1352 * block into the leaves.
1353 */
1354 xfs_dir2_leafn_rebalance(state, oldblk, newblk);
1355 error = xfs_da3_blk_link(state, oldblk, newblk);
1356 if (error) {
1357 return error;
1358 }
1359 /*
1360 * Insert the new entry in the correct block.
1361 */
1362 if (state->inleaf)
1363 error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
1364 else
1365 error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
1366 /*
1367 * Update last hashval in each block since we added the name.
1368 */
1369 oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
1370 newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
1371 xfs_dir3_leaf_check(dp, oldblk->bp);
1372 xfs_dir3_leaf_check(dp, newblk->bp);
1373 return error;
1374}
1375
1376/*
1377 * Check a leaf block and its neighbors to see if the block should be
1378 * collapsed into one or the other neighbor. Always keep the block
1379 * with the smaller block number.
1380 * If the current block is over 50% full, don't try to join it, return 0.
1381 * If the block is empty, fill in the state structure and return 2.
1382 * If it can be collapsed, fill in the state structure and return 1.
1383 * If nothing can be done, return 0.
1384 */
1385int /* error */
1386xfs_dir2_leafn_toosmall(
1387 xfs_da_state_t *state, /* btree cursor */
1388 int *action) /* resulting action to take */
1389{
1390 xfs_da_state_blk_t *blk; /* leaf block */
1391 xfs_dablk_t blkno; /* leaf block number */
1392 struct xfs_buf *bp; /* leaf buffer */
1393 int bytes; /* bytes in use */
1394 int count; /* leaf live entry count */
1395 int error; /* error return value */
1396 int forward; /* sibling block direction */
1397 int i; /* sibling counter */
1398 xfs_dir2_leaf_t *leaf; /* leaf structure */
1399 int rval; /* result from path_shift */
1400 struct xfs_dir3_icleaf_hdr leafhdr;
1401 struct xfs_dir2_leaf_entry *ents;
1402 struct xfs_inode *dp = state->args->dp;
1403
1404 /*
1405 * Check for the degenerate case of the block being over 50% full.
1406 * If so, it's not worth even looking to see if we might be able
1407 * to coalesce with a sibling.
1408 */
1409 blk = &state->path.blk[state->path.active - 1];
1410 leaf = blk->bp->b_addr;
1411 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1412 ents = dp->d_ops->leaf_ents_p(leaf);
1413 xfs_dir3_leaf_check(dp, blk->bp);
1414
1415 count = leafhdr.count - leafhdr.stale;
1416 bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
1417 if (bytes > (state->args->geo->blksize >> 1)) {
1418 /*
1419 * Blk over 50%, don't try to join.
1420 */
1421 *action = 0;
1422 return 0;
1423 }
1424 /*
1425 * Check for the degenerate case of the block being empty.
1426 * If the block is empty, we'll simply delete it, no need to
1427 * coalesce it with a sibling block. We choose (arbitrarily)
1428 * to merge with the forward block unless it is NULL.
1429 */
1430 if (count == 0) {
1431 /*
1432 * Make altpath point to the block we want to keep and
1433 * path point to the block we want to drop (this one).
1434 */
1435 forward = (leafhdr.forw != 0);
1436 memcpy(&state->altpath, &state->path, sizeof(state->path));
1437 error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
1438 &rval);
1439 if (error)
1440 return error;
1441 *action = rval ? 2 : 0;
1442 return 0;
1443 }
1444 /*
1445 * Examine each sibling block to see if we can coalesce with
1446 * at least 25% free space to spare. We need to figure out
1447 * whether to merge with the forward or the backward block.
1448 * We prefer coalescing with the lower numbered sibling so as
1449 * to shrink a directory over time.
1450 */
1451 forward = leafhdr.forw < leafhdr.back;
1452 for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
1453 struct xfs_dir3_icleaf_hdr hdr2;
1454
1455 blkno = forward ? leafhdr.forw : leafhdr.back;
1456 if (blkno == 0)
1457 continue;
1458 /*
1459 * Read the sibling leaf block.
1460 */
1461 error = xfs_dir3_leafn_read(state->args->trans, dp,
1462 blkno, -1, &bp);
1463 if (error)
1464 return error;
1465
1466 /*
1467 * Count bytes in the two blocks combined.
1468 */
1469 count = leafhdr.count - leafhdr.stale;
1470 bytes = state->args->geo->blksize -
1471 (state->args->geo->blksize >> 2);
1472
1473 leaf = bp->b_addr;
1474 dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
1475 ents = dp->d_ops->leaf_ents_p(leaf);
1476 count += hdr2.count - hdr2.stale;
1477 bytes -= count * sizeof(ents[0]);
1478
1479 /*
1480 * Fits with at least 25% to spare.
1481 */
1482 if (bytes >= 0)
1483 break;
1484 xfs_trans_brelse(state->args->trans, bp);
1485 }
1486 /*
1487 * Didn't like either block, give up.
1488 */
1489 if (i >= 2) {
1490 *action = 0;
1491 return 0;
1492 }
1493
1494 /*
1495 * Make altpath point to the block we want to keep (the lower
1496 * numbered block) and path point to the block we want to drop.
1497 */
1498 memcpy(&state->altpath, &state->path, sizeof(state->path));
1499 if (blkno < blk->blkno)
1500 error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
1501 &rval);
1502 else
1503 error = xfs_da3_path_shift(state, &state->path, forward, 0,
1504 &rval);
1505 if (error) {
1506 return error;
1507 }
1508 *action = rval ? 0 : 1;
1509 return 0;
1510}
1511
1512/*
1513 * Move all the leaf entries from drop_blk to save_blk.
1514 * This is done as part of a join operation.
1515 */
1516void
1517xfs_dir2_leafn_unbalance(
1518 xfs_da_state_t *state, /* cursor */
1519 xfs_da_state_blk_t *drop_blk, /* dead block */
1520 xfs_da_state_blk_t *save_blk) /* surviving block */
1521{
1522 xfs_da_args_t *args; /* operation arguments */
1523 xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
1524 xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
1525 struct xfs_dir3_icleaf_hdr savehdr;
1526 struct xfs_dir3_icleaf_hdr drophdr;
1527 struct xfs_dir2_leaf_entry *sents;
1528 struct xfs_dir2_leaf_entry *dents;
1529 struct xfs_inode *dp = state->args->dp;
1530
1531 args = state->args;
1532 ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
1533 ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
1534 drop_leaf = drop_blk->bp->b_addr;
1535 save_leaf = save_blk->bp->b_addr;
1536
1537 dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
1538 dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
1539 sents = dp->d_ops->leaf_ents_p(save_leaf);
1540 dents = dp->d_ops->leaf_ents_p(drop_leaf);
1541
1542 /*
1543 * If there are any stale leaf entries, take this opportunity
1544 * to purge them.
1545 */
1546 if (drophdr.stale)
1547 xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
1548 if (savehdr.stale)
1549 xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
1550
1551 /*
1552 * Move the entries from drop to the appropriate end of save.
1553 */
1554 drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
1555 if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
1556 xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
1557 save_blk->bp, &savehdr, sents, 0,
1558 drophdr.count);
1559 else
1560 xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
1561 save_blk->bp, &savehdr, sents,
1562 savehdr.count, drophdr.count);
1563 save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
1564
1565 /* log the changes made when moving the entries */
1566 dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
1567 dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
1568 xfs_dir3_leaf_log_header(args, save_blk->bp);
1569 xfs_dir3_leaf_log_header(args, drop_blk->bp);
1570
1571 xfs_dir3_leaf_check(dp, save_blk->bp);
1572 xfs_dir3_leaf_check(dp, drop_blk->bp);
1573}
1574
1575/*
1576 * Top-level node form directory addname routine.
1577 */
1578int /* error */
1579xfs_dir2_node_addname(
1580 xfs_da_args_t *args) /* operation arguments */
1581{
1582 xfs_da_state_blk_t *blk; /* leaf block for insert */
1583 int error; /* error return value */
1584 int rval; /* sub-return value */
1585 xfs_da_state_t *state; /* btree cursor */
1586
1587 trace_xfs_dir2_node_addname(args);
1588
1589 /*
1590 * Allocate and initialize the state (btree cursor).
1591 */
1592 state = xfs_da_state_alloc();
1593 state->args = args;
1594 state->mp = args->dp->i_mount;
1595 /*
1596 * Look up the name. We're not supposed to find it, but
1597 * this gives us the insertion point.
1598 */
1599 error = xfs_da3_node_lookup_int(state, &rval);
1600 if (error)
1601 rval = error;
1602 if (rval != -ENOENT) {
1603 goto done;
1604 }
1605 /*
1606 * Add the data entry to a data block.
1607 * Extravalid is set to a freeblock found by lookup.
1608 */
1609 rval = xfs_dir2_node_addname_int(args,
1610 state->extravalid ? &state->extrablk : NULL);
1611 if (rval) {
1612 goto done;
1613 }
1614 blk = &state->path.blk[state->path.active - 1];
1615 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1616 /*
1617 * Add the new leaf entry.
1618 */
1619 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
1620 if (rval == 0) {
1621 /*
1622 * It worked, fix the hash values up the btree.
1623 */
1624 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
1625 xfs_da3_fixhashpath(state, &state->path);
1626 } else {
1627 /*
1628 * It didn't work, we need to split the leaf block.
1629 */
1630 if (args->total == 0) {
1631 ASSERT(rval == -ENOSPC);
1632 goto done;
1633 }
1634 /*
1635 * Split the leaf block and insert the new entry.
1636 */
1637 rval = xfs_da3_split(state);
1638 }
1639done:
1640 xfs_da_state_free(state);
1641 return rval;
1642}
1643
1644/*
1645 * Add the data entry for a node-format directory name addition.
1646 * The leaf entry is added in xfs_dir2_leafn_add.
1647 * We may enter with a freespace block that the lookup found.
1648 */
1649static int /* error */
1650xfs_dir2_node_addname_int(
1651 xfs_da_args_t *args, /* operation arguments */
1652 xfs_da_state_blk_t *fblk) /* optional freespace block */
1653{
1654 xfs_dir2_data_hdr_t *hdr; /* data block header */
1655 xfs_dir2_db_t dbno; /* data block number */
1656 struct xfs_buf *dbp; /* data block buffer */
1657 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1658 xfs_inode_t *dp; /* incore directory inode */
1659 xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
1660 int error; /* error return value */
1661 xfs_dir2_db_t fbno; /* freespace block number */
1662 struct xfs_buf *fbp; /* freespace buffer */
1663 int findex; /* freespace entry index */
1664 xfs_dir2_free_t *free=NULL; /* freespace block structure */
1665 xfs_dir2_db_t ifbno; /* initial freespace block no */
1666 xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
1667 int length; /* length of the new entry */
1668 int logfree; /* need to log free entry */
1669 xfs_mount_t *mp; /* filesystem mount point */
1670 int needlog; /* need to log data header */
1671 int needscan; /* need to rescan data frees */
1672 __be16 *tagp; /* data entry tag pointer */
1673 xfs_trans_t *tp; /* transaction pointer */
1674 __be16 *bests;
1675 struct xfs_dir3_icfree_hdr freehdr;
1676 struct xfs_dir2_data_free *bf;
1677
1678 dp = args->dp;
1679 mp = dp->i_mount;
1680 tp = args->trans;
1681 length = dp->d_ops->data_entsize(args->namelen);
1682 /*
1683 * If we came in with a freespace block that means that lookup
1684 * found an entry with our hash value. This is the freespace
1685 * block for that data entry.
1686 */
1687 if (fblk) {
1688 fbp = fblk->bp;
1689 /*
1690 * Remember initial freespace block number.
1691 */
1692 ifbno = fblk->blkno;
1693 free = fbp->b_addr;
1694 findex = fblk->index;
1695 bests = dp->d_ops->free_bests_p(free);
1696 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1697
1698 /*
1699 * This means the free entry showed that the data block had
1700 * space for our entry, so we remembered it.
1701 * Use that data block.
1702 */
1703 if (findex >= 0) {
1704 ASSERT(findex < freehdr.nvalid);
1705 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
1706 ASSERT(be16_to_cpu(bests[findex]) >= length);
1707 dbno = freehdr.firstdb + findex;
1708 } else {
1709 /*
1710 * The data block looked at didn't have enough room.
1711 * We'll start at the beginning of the freespace entries.
1712 */
1713 dbno = -1;
1714 findex = 0;
1715 }
1716 } else {
1717 /*
1718 * Didn't come in with a freespace block, so no data block.
1719 */
1720 ifbno = dbno = -1;
1721 fbp = NULL;
1722 findex = 0;
1723 }
1724
1725 /*
1726 * If we don't have a data block yet, we're going to scan the
1727 * freespace blocks looking for one. Figure out what the
1728 * highest freespace block number is.
1729 */
1730 if (dbno == -1) {
1731 xfs_fileoff_t fo; /* freespace block number */
1732
1733 if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
1734 return error;
1735 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1736 fbno = ifbno;
1737 }
1738 /*
1739 * While we haven't identified a data block, search the freeblock
1740 * data for a good data block. If we find a null freeblock entry,
1741 * indicating a hole in the data blocks, remember that.
1742 */
1743 while (dbno == -1) {
1744 /*
1745 * If we don't have a freeblock in hand, get the next one.
1746 */
1747 if (fbp == NULL) {
1748 /*
1749 * Happens the first time through unless lookup gave
1750 * us a freespace block to start with.
1751 */
1752 if (++fbno == 0)
1753 fbno = xfs_dir2_byte_to_db(args->geo,
1754 XFS_DIR2_FREE_OFFSET);
1755 /*
1756 * If it's ifbno we already looked at it.
1757 */
1758 if (fbno == ifbno)
1759 fbno++;
1760 /*
1761 * If it's off the end we're done.
1762 */
1763 if (fbno >= lastfbno)
1764 break;
1765 /*
1766 * Read the block. There can be holes in the
1767 * freespace blocks, so this might not succeed.
1768 * This should be really rare, so there's no reason
1769 * to avoid it.
1770 */
1771 error = xfs_dir2_free_try_read(tp, dp,
1772 xfs_dir2_db_to_da(args->geo, fbno),
1773 &fbp);
1774 if (error)
1775 return error;
1776 if (!fbp)
1777 continue;
1778 free = fbp->b_addr;
1779 findex = 0;
1780 }
1781 /*
1782 * Look at the current free entry. Is it good enough?
1783 *
1784 * The bests initialisation should be where the buffer is read in
1785 * the above branch. But gcc is too stupid to realise that bests
1786 * and the freehdr are actually initialised if they are placed
1787 * there, so we have to do it here to avoid warnings. Blech.
1788 */
1789 bests = dp->d_ops->free_bests_p(free);
1790 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1791 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1792 be16_to_cpu(bests[findex]) >= length)
1793 dbno = freehdr.firstdb + findex;
1794 else {
1795 /*
1796 * Are we done with the freeblock?
1797 */
1798 if (++findex == freehdr.nvalid) {
1799 /*
1800 * Drop the block.
1801 */
1802 xfs_trans_brelse(tp, fbp);
1803 fbp = NULL;
1804 if (fblk && fblk->bp)
1805 fblk->bp = NULL;
1806 }
1807 }
1808 }
1809 /*
1810 * If we don't have a data block, we need to allocate one and make
1811 * the freespace entries refer to it.
1812 */
1813 if (unlikely(dbno == -1)) {
1814 /*
1815 * Not allowed to allocate, return failure.
1816 */
1817 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
1818 return -ENOSPC;
1819
1820 /*
1821 * Allocate and initialize the new data block.
1822 */
1823 if (unlikely((error = xfs_dir2_grow_inode(args,
1824 XFS_DIR2_DATA_SPACE,
1825 &dbno)) ||
1826 (error = xfs_dir3_data_init(args, dbno, &dbp))))
1827 return error;
1828
1829 /*
1830 * If (somehow) we have a freespace block, get rid of it.
1831 */
1832 if (fbp)
1833 xfs_trans_brelse(tp, fbp);
1834 if (fblk && fblk->bp)
1835 fblk->bp = NULL;
1836
1837 /*
1838 * Get the freespace block corresponding to the data block
1839 * that was just allocated.
1840 */
1841 fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
1842 error = xfs_dir2_free_try_read(tp, dp,
1843 xfs_dir2_db_to_da(args->geo, fbno),
1844 &fbp);
1845 if (error)
1846 return error;
1847
1848 /*
1849 * If there wasn't a freespace block, the read will
1850 * return a NULL fbp. Allocate and initialize a new one.
1851 */
1852 if (!fbp) {
1853 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
1854 &fbno);
1855 if (error)
1856 return error;
1857
1858 if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
1859 xfs_alert(mp,
1860 "%s: dir ino %llu needed freesp block %lld for\n"
1861 " data block %lld, got %lld ifbno %llu lastfbno %d",
1862 __func__, (unsigned long long)dp->i_ino,
1863 (long long)dp->d_ops->db_to_fdb(
1864 args->geo, dbno),
1865 (long long)dbno, (long long)fbno,
1866 (unsigned long long)ifbno, lastfbno);
1867 if (fblk) {
1868 xfs_alert(mp,
1869 " fblk 0x%p blkno %llu index %d magic 0x%x",
1870 fblk,
1871 (unsigned long long)fblk->blkno,
1872 fblk->index,
1873 fblk->magic);
1874 } else {
1875 xfs_alert(mp, " ... fblk is NULL");
1876 }
1877 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1878 XFS_ERRLEVEL_LOW, mp);
1879 return -EFSCORRUPTED;
1880 }
1881
1882 /*
1883 * Get a buffer for the new block.
1884 */
1885 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1886 if (error)
1887 return error;
1888 free = fbp->b_addr;
1889 bests = dp->d_ops->free_bests_p(free);
1890 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1891
1892 /*
1893 * Remember the first slot as our empty slot.
1894 */
1895 freehdr.firstdb =
1896 (fbno - xfs_dir2_byte_to_db(args->geo,
1897 XFS_DIR2_FREE_OFFSET)) *
1898 dp->d_ops->free_max_bests(args->geo);
1899 } else {
1900 free = fbp->b_addr;
1901 bests = dp->d_ops->free_bests_p(free);
1902 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1903 }
1904
1905 /*
1906 * Set the freespace block index from the data block number.
1907 */
1908 findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
1909 /*
1910 * If it's after the end of the current entries in the
1911 * freespace block, extend that table.
1912 */
1913 if (findex >= freehdr.nvalid) {
1914 ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
1915 freehdr.nvalid = findex + 1;
1916 /*
1917 * Tag new entry so nused will go up.
1918 */
1919 bests[findex] = cpu_to_be16(NULLDATAOFF);
1920 }
1921 /*
1922 * If this entry was for an empty data block
1923 * (this should always be true) then update the header.
1924 */
1925 if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
1926 freehdr.nused++;
1927 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1928 xfs_dir2_free_log_header(args, fbp);
1929 }
1930 /*
1931 * Update the real value in the table.
1932 * We haven't allocated the data entry yet so this will
1933 * change again.
1934 */
1935 hdr = dbp->b_addr;
1936 bf = dp->d_ops->data_bestfree_p(hdr);
1937 bests[findex] = bf[0].length;
1938 logfree = 1;
1939 }
1940 /*
1941 * We had a data block so we don't have to make a new one.
1942 */
1943 else {
1944 /*
1945 * If just checking, we succeeded.
1946 */
1947 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
1948 return 0;
1949
1950 /*
1951 * Read the data block in.
1952 */
1953 error = xfs_dir3_data_read(tp, dp,
1954 xfs_dir2_db_to_da(args->geo, dbno),
1955 -1, &dbp);
1956 if (error)
1957 return error;
1958 hdr = dbp->b_addr;
1959 bf = dp->d_ops->data_bestfree_p(hdr);
1960 logfree = 0;
1961 }
1962 ASSERT(be16_to_cpu(bf[0].length) >= length);
1963 /*
1964 * Point to the existing unused space.
1965 */
1966 dup = (xfs_dir2_data_unused_t *)
1967 ((char *)hdr + be16_to_cpu(bf[0].offset));
1968 needscan = needlog = 0;
1969 /*
1970 * Mark the first part of the unused space, inuse for us.
1971 */
1972 xfs_dir2_data_use_free(args, dbp, dup,
1973 (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
1974 &needlog, &needscan);
1975 /*
1976 * Fill in the new entry and log it.
1977 */
1978 dep = (xfs_dir2_data_entry_t *)dup;
1979 dep->inumber = cpu_to_be64(args->inumber);
1980 dep->namelen = args->namelen;
1981 memcpy(dep->name, args->name, dep->namelen);
1982 dp->d_ops->data_put_ftype(dep, args->filetype);
1983 tagp = dp->d_ops->data_entry_tag_p(dep);
1984 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1985 xfs_dir2_data_log_entry(args, dbp, dep);
1986 /*
1987 * Rescan the block for bestfree if needed.
1988 */
1989 if (needscan)
1990 xfs_dir2_data_freescan(dp, hdr, &needlog);
1991 /*
1992 * Log the data block header if needed.
1993 */
1994 if (needlog)
1995 xfs_dir2_data_log_header(args, dbp);
1996 /*
1997 * If the freespace entry is now wrong, update it.
1998 */
1999 bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
2000 if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
2001 bests[findex] = bf[0].length;
2002 logfree = 1;
2003 }
2004 /*
2005 * Log the freespace entry if needed.
2006 */
2007 if (logfree)
2008 xfs_dir2_free_log_bests(args, fbp, findex, findex);
2009 /*
2010 * Return the data block and offset in args, then drop the data block.
2011 */
2012 args->blkno = (xfs_dablk_t)dbno;
2013 args->index = be16_to_cpu(*tagp);
2014 return 0;
2015}
2016
2017/*
2018 * Lookup an entry in a node-format directory.
2019 * All the real work happens in xfs_da3_node_lookup_int.
2020 * The only real output is the inode number of the entry.
2021 */
2022int /* error */
2023xfs_dir2_node_lookup(
2024 xfs_da_args_t *args) /* operation arguments */
2025{
2026 int error; /* error return value */
2027 int i; /* btree level */
2028 int rval; /* operation return value */
2029 xfs_da_state_t *state; /* btree cursor */
2030
2031 trace_xfs_dir2_node_lookup(args);
2032
2033 /*
2034 * Allocate and initialize the btree cursor.
2035 */
2036 state = xfs_da_state_alloc();
2037 state->args = args;
2038 state->mp = args->dp->i_mount;
2039 /*
2040 * Fill in the path to the entry in the cursor.
2041 */
2042 error = xfs_da3_node_lookup_int(state, &rval);
2043 if (error)
2044 rval = error;
2045 else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
2046 /* If a CI match, dup the actual name and return -EEXIST */
2047 xfs_dir2_data_entry_t *dep;
2048
2049 dep = (xfs_dir2_data_entry_t *)
2050 ((char *)state->extrablk.bp->b_addr +
2051 state->extrablk.index);
2052 rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
2053 }
2054 /*
2055 * Release the btree blocks and leaf block.
2056 */
2057 for (i = 0; i < state->path.active; i++) {
2058 xfs_trans_brelse(args->trans, state->path.blk[i].bp);
2059 state->path.blk[i].bp = NULL;
2060 }
2061 /*
2062 * Release the data block if we have it.
2063 */
2064 if (state->extravalid && state->extrablk.bp) {
2065 xfs_trans_brelse(args->trans, state->extrablk.bp);
2066 state->extrablk.bp = NULL;
2067 }
2068 xfs_da_state_free(state);
2069 return rval;
2070}
2071
2072/*
2073 * Remove an entry from a node-format directory.
2074 */
2075int /* error */
2076xfs_dir2_node_removename(
2077 struct xfs_da_args *args) /* operation arguments */
2078{
2079 struct xfs_da_state_blk *blk; /* leaf block */
2080 int error; /* error return value */
2081 int rval; /* operation return value */
2082 struct xfs_da_state *state; /* btree cursor */
2083
2084 trace_xfs_dir2_node_removename(args);
2085
2086 /*
2087 * Allocate and initialize the btree cursor.
2088 */
2089 state = xfs_da_state_alloc();
2090 state->args = args;
2091 state->mp = args->dp->i_mount;
2092
2093 /* Look up the entry we're deleting, set up the cursor. */
2094 error = xfs_da3_node_lookup_int(state, &rval);
2095 if (error)
2096 goto out_free;
2097
2098 /* Didn't find it, upper layer screwed up. */
2099 if (rval != -EEXIST) {
2100 error = rval;
2101 goto out_free;
2102 }
2103
2104 blk = &state->path.blk[state->path.active - 1];
2105 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
2106 ASSERT(state->extravalid);
2107 /*
2108 * Remove the leaf and data entries.
2109 * Extrablk refers to the data block.
2110 */
2111 error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
2112 &state->extrablk, &rval);
2113 if (error)
2114 goto out_free;
2115 /*
2116 * Fix the hash values up the btree.
2117 */
2118 xfs_da3_fixhashpath(state, &state->path);
2119 /*
2120 * If we need to join leaf blocks, do it.
2121 */
2122 if (rval && state->path.active > 1)
2123 error = xfs_da3_join(state);
2124 /*
2125 * If no errors so far, try conversion to leaf format.
2126 */
2127 if (!error)
2128 error = xfs_dir2_node_to_leaf(state);
2129out_free:
2130 xfs_da_state_free(state);
2131 return error;
2132}
2133
2134/*
2135 * Replace an entry's inode number in a node-format directory.
2136 */
2137int /* error */
2138xfs_dir2_node_replace(
2139 xfs_da_args_t *args) /* operation arguments */
2140{
2141 xfs_da_state_blk_t *blk; /* leaf block */
2142 xfs_dir2_data_hdr_t *hdr; /* data block header */
2143 xfs_dir2_data_entry_t *dep; /* data entry changed */
2144 int error; /* error return value */
2145 int i; /* btree level */
2146 xfs_ino_t inum; /* new inode number */
2147 xfs_dir2_leaf_t *leaf; /* leaf structure */
2148 xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
2149 int rval; /* internal return value */
2150 xfs_da_state_t *state; /* btree cursor */
2151
2152 trace_xfs_dir2_node_replace(args);
2153
2154 /*
2155 * Allocate and initialize the btree cursor.
2156 */
2157 state = xfs_da_state_alloc();
2158 state->args = args;
2159 state->mp = args->dp->i_mount;
2160 inum = args->inumber;
2161 /*
2162 * Lookup the entry to change in the btree.
2163 */
2164 error = xfs_da3_node_lookup_int(state, &rval);
2165 if (error) {
2166 rval = error;
2167 }
2168 /*
2169 * It should be found, since the vnodeops layer has looked it up
2170 * and locked it. But paranoia is good.
2171 */
2172 if (rval == -EEXIST) {
2173 struct xfs_dir2_leaf_entry *ents;
2174 /*
2175 * Find the leaf entry.
2176 */
2177 blk = &state->path.blk[state->path.active - 1];
2178 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
2179 leaf = blk->bp->b_addr;
2180 ents = args->dp->d_ops->leaf_ents_p(leaf);
2181 lep = &ents[blk->index];
2182 ASSERT(state->extravalid);
2183 /*
2184 * Point to the data entry.
2185 */
2186 hdr = state->extrablk.bp->b_addr;
2187 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
2188 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
2189 dep = (xfs_dir2_data_entry_t *)
2190 ((char *)hdr +
2191 xfs_dir2_dataptr_to_off(args->geo,
2192 be32_to_cpu(lep->address)));
2193 ASSERT(inum != be64_to_cpu(dep->inumber));
2194 /*
2195 * Fill in the new inode number and log the entry.
2196 */
2197 dep->inumber = cpu_to_be64(inum);
2198 args->dp->d_ops->data_put_ftype(dep, args->filetype);
2199 xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
2200 rval = 0;
2201 }
2202 /*
2203 * Didn't find it, and we're holding a data block. Drop it.
2204 */
2205 else if (state->extravalid) {
2206 xfs_trans_brelse(args->trans, state->extrablk.bp);
2207 state->extrablk.bp = NULL;
2208 }
2209 /*
2210 * Release all the buffers in the cursor.
2211 */
2212 for (i = 0; i < state->path.active; i++) {
2213 xfs_trans_brelse(args->trans, state->path.blk[i].bp);
2214 state->path.blk[i].bp = NULL;
2215 }
2216 xfs_da_state_free(state);
2217 return rval;
2218}
2219
2220/*
2221 * Trim off a trailing empty freespace block.
2222 * Return (in rvalp) 1 if we did it, 0 if not.
2223 */
2224int /* error */
2225xfs_dir2_node_trim_free(
2226 xfs_da_args_t *args, /* operation arguments */
2227 xfs_fileoff_t fo, /* free block number */
2228 int *rvalp) /* out: did something */
2229{
2230 struct xfs_buf *bp; /* freespace buffer */
2231 xfs_inode_t *dp; /* incore directory inode */
2232 int error; /* error return code */
2233 xfs_dir2_free_t *free; /* freespace structure */
2234 xfs_mount_t *mp; /* filesystem mount point */
2235 xfs_trans_t *tp; /* transaction pointer */
2236 struct xfs_dir3_icfree_hdr freehdr;
2237
2238 dp = args->dp;
2239 mp = dp->i_mount;
2240 tp = args->trans;
2241 /*
2242 * Read the freespace block.
2243 */
2244 error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
2245 if (error)
2246 return error;
2247 /*
2248 * There can be holes in freespace. If fo is a hole, there's
2249 * nothing to do.
2250 */
2251 if (!bp)
2252 return 0;
2253 free = bp->b_addr;
2254 dp->d_ops->free_hdr_from_disk(&freehdr, free);
2255
2256 /*
2257 * If there are used entries, there's nothing to do.
2258 */
2259 if (freehdr.nused > 0) {
2260 xfs_trans_brelse(tp, bp);
2261 *rvalp = 0;
2262 return 0;
2263 }
2264 /*
2265 * Blow the block away.
2266 */
2267 error = xfs_dir2_shrink_inode(args,
2268 xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
2269 if (error) {
2270 /*
2271 * Can't fail with ENOSPC since that only happens with no
2272 * space reservation, when breaking up an extent into two
2273 * pieces. This is the last block of an extent.
2274 */
2275 ASSERT(error != -ENOSPC);
2276 xfs_trans_brelse(tp, bp);
2277 return error;
2278 }
2279 /*
2280 * Return that we succeeded.
2281 */
2282 *rvalp = 1;
2283 return 0;
2284}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
new file mode 100644
index 000000000000..27ce0794d196
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -0,0 +1,274 @@
1/*
2 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DIR2_PRIV_H__
19#define __XFS_DIR2_PRIV_H__
20
21struct dir_context;
22
23/*
24 * Directory offset/block conversion functions.
25 *
26 * DB blocks here are logical directory block numbers, not filesystem blocks.
27 */
28
29/*
30 * Convert dataptr to byte in file space
31 */
32static inline xfs_dir2_off_t
33xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
34{
35 return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
36}
37
38/*
39 * Convert byte in file space to dataptr. It had better be aligned.
40 */
41static inline xfs_dir2_dataptr_t
42xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
43{
44 return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
45}
46
47/*
48 * Convert byte in space to (DB) block
49 */
50static inline xfs_dir2_db_t
51xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
52{
53 return (xfs_dir2_db_t)(by >> geo->blklog);
54}
55
56/*
57 * Convert dataptr to a block number
58 */
59static inline xfs_dir2_db_t
60xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
61{
62 return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
63}
64
65/*
66 * Convert byte in space to offset in a block
67 */
68static inline xfs_dir2_data_aoff_t
69xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
70{
71 return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
72}
73
74/*
75 * Convert dataptr to a byte offset in a block
76 */
77static inline xfs_dir2_data_aoff_t
78xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
79{
80 return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
81}
82
83/*
84 * Convert block and offset to byte in space
85 */
86static inline xfs_dir2_off_t
87xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
88 xfs_dir2_data_aoff_t o)
89{
90 return ((xfs_dir2_off_t)db << geo->blklog) + o;
91}
92
93/*
94 * Convert block (DB) to block (dablk)
95 */
96static inline xfs_dablk_t
97xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
98{
99 return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
100}
101
102/*
103 * Convert byte in space to (DA) block
104 */
105static inline xfs_dablk_t
106xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
107{
108 return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
109}
110
111/*
112 * Convert block and offset to dataptr
113 */
114static inline xfs_dir2_dataptr_t
115xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
116 xfs_dir2_data_aoff_t o)
117{
118 return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
119}
120
121/*
122 * Convert block (dablk) to block (DB)
123 */
124static inline xfs_dir2_db_t
125xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
126{
127 return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
128}
129
130/*
131 * Convert block (dablk) to byte offset in space
132 */
133static inline xfs_dir2_off_t
134xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
135{
136 return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
137}
138
139/*
140 * Directory tail pointer accessor functions. Based on block geometry.
141 */
142static inline struct xfs_dir2_block_tail *
143xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
144{
145 return ((struct xfs_dir2_block_tail *)
146 ((char *)hdr + geo->blksize)) - 1;
147}
148
149static inline struct xfs_dir2_leaf_tail *
150xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
151{
152 return (struct xfs_dir2_leaf_tail *)
153 ((char *)lp + geo->blksize -
154 sizeof(struct xfs_dir2_leaf_tail));
155}
156
157/* xfs_dir2.c */
158extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
159extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
160 xfs_dir2_db_t *dbp);
161extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
162 const unsigned char *name, int len);
163
164#define S_SHIFT 12
165extern const unsigned char xfs_mode_to_ftype[];
166
167extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
168 __uint8_t filetype);
169
170
171/* xfs_dir2_block.c */
172extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
173 struct xfs_buf **bpp);
174extern int xfs_dir2_block_addname(struct xfs_da_args *args);
175extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
176extern int xfs_dir2_block_removename(struct xfs_da_args *args);
177extern int xfs_dir2_block_replace(struct xfs_da_args *args);
178extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
179 struct xfs_buf *lbp, struct xfs_buf *dbp);
180
181/* xfs_dir2_data.c */
/*
 * Debug-only consistency check of a directory data block.
 *
 * The DEBUG expansion deliberately has no trailing semicolon: the caller
 * supplies it, so the macro expands to exactly one statement and stays
 * safe inside an un-braced if/else. Without DEBUG it expands to nothing.
 */
#ifdef DEBUG
#define	xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp)
#else
#define	xfs_dir3_data_check(dp,bp)
#endif
187
188extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
189extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
190 xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
191extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
192 xfs_daddr_t mapped_bno);
193
194extern struct xfs_dir2_data_free *
195xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
196 struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
197 int *loghead);
198extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
199 struct xfs_buf **bpp);
200
201/* xfs_dir2_leaf.c */
202extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
203 xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
204extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
205 struct xfs_buf *dbp);
206extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
207extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
208 struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
209extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
210 struct xfs_dir2_leaf_entry *ents, int *indexp,
211 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
212extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
213 struct xfs_buf **bpp, __uint16_t magic);
214extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
215 struct xfs_buf *bp, int first, int last);
216extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
217 struct xfs_buf *bp);
218extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
219extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
220extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
221extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
222 struct xfs_buf *lbp);
223extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
224 struct xfs_buf *lbp, xfs_dir2_db_t db);
225extern struct xfs_dir2_leaf_entry *
226xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
227 struct xfs_dir2_leaf_entry *ents, int index, int compact,
228 int lowstale, int highstale, int *lfloglow, int *lfloghigh);
229extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
230
231extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
232 struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
233
234/* xfs_dir2_node.c */
235extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
236 struct xfs_buf *lbp);
237extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
238 struct xfs_buf *bp, int *count);
239extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
240 struct xfs_da_args *args, int *indexp,
241 struct xfs_da_state *state);
242extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
243 struct xfs_buf *leaf2_bp);
244extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
245 struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
246extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
247extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
248 struct xfs_da_state_blk *drop_blk,
249 struct xfs_da_state_blk *save_blk);
250extern int xfs_dir2_node_addname(struct xfs_da_args *args);
251extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
252extern int xfs_dir2_node_removename(struct xfs_da_args *args);
253extern int xfs_dir2_node_replace(struct xfs_da_args *args);
254extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
255 int *rvalp);
256extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
257 xfs_dablk_t fbno, struct xfs_buf **bpp);
258
259/* xfs_dir2_sf.c */
260extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
261 struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
262extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
263 int size, xfs_dir2_sf_hdr_t *sfhp);
264extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
265extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
266extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
267extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
268extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
269
270/* xfs_dir2_readdir.c */
271extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
272 size_t bufsize);
273
274#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
new file mode 100644
index 000000000000..8f4f26af35e1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -0,0 +1,1184 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_format.h"
21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_da_format.h"
27#include "xfs_da_btree.h"
28#include "xfs_inode.h"
29#include "xfs_trans.h"
30#include "xfs_inode_item.h"
31#include "xfs_error.h"
32#include "xfs_dir2.h"
33#include "xfs_dir2_priv.h"
34#include "xfs_trace.h"
35#include "xfs_dinode.h"
36
37/*
38 * Prototypes for internal functions.
39 */
40static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
41 xfs_dir2_sf_entry_t *sfep,
42 xfs_dir2_data_aoff_t offset,
43 int new_isize);
44static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
45 int new_isize);
46static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
47 xfs_dir2_sf_entry_t **sfepp,
48 xfs_dir2_data_aoff_t *offsetp);
49#ifdef DEBUG
50static void xfs_dir2_sf_check(xfs_da_args_t *args);
51#else
52#define xfs_dir2_sf_check(args)
53#endif /* DEBUG */
54#if XFS_BIG_INUMS
55static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
56static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
57#endif /* XFS_BIG_INUMS */
58
59/*
60 * Given a block directory (dp/block), calculate its size as a shortform (sf)
61 * directory and a header for the sf directory, if it will fit it the
62 * space currently present in the inode. If it won't fit, the output
63 * size is too big (but not accurate).
64 */
65int /* size for sf form */
66xfs_dir2_block_sfsize(
67 xfs_inode_t *dp, /* incore inode pointer */
68 xfs_dir2_data_hdr_t *hdr, /* block directory data */
69 xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */
70{
71 xfs_dir2_dataptr_t addr; /* data entry address */
72 xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */
73 xfs_dir2_block_tail_t *btp; /* tail area of the block */
74 int count; /* shortform entry count */
75 xfs_dir2_data_entry_t *dep; /* data entry in the block */
76 int i; /* block entry index */
77 int i8count; /* count of big-inode entries */
78 int isdot; /* entry is "." */
79 int isdotdot; /* entry is ".." */
80 xfs_mount_t *mp; /* mount structure pointer */
81 int namelen; /* total name bytes */
82 xfs_ino_t parent = 0; /* parent inode number */
83 int size=0; /* total computed size */
84 int has_ftype;
85 struct xfs_da_geometry *geo;
86
87 mp = dp->i_mount;
88 geo = mp->m_dir_geo;
89
90 /*
91 * if there is a filetype field, add the extra byte to the namelen
92 * for each entry that we see.
93 */
94 has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
95
96 count = i8count = namelen = 0;
97 btp = xfs_dir2_block_tail_p(geo, hdr);
98 blp = xfs_dir2_block_leaf_p(btp);
99
100 /*
101 * Iterate over the block's data entries by using the leaf pointers.
102 */
103 for (i = 0; i < be32_to_cpu(btp->count); i++) {
104 if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
105 continue;
106 /*
107 * Calculate the pointer to the entry at hand.
108 */
109 dep = (xfs_dir2_data_entry_t *)((char *)hdr +
110 xfs_dir2_dataptr_to_off(geo, addr));
111 /*
112 * Detect . and .., so we can special-case them.
113 * . is not included in sf directories.
114 * .. is included by just the parent inode number.
115 */
116 isdot = dep->namelen == 1 && dep->name[0] == '.';
117 isdotdot =
118 dep->namelen == 2 &&
119 dep->name[0] == '.' && dep->name[1] == '.';
120#if XFS_BIG_INUMS
121 if (!isdot)
122 i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
123#endif
124 /* take into account the file type field */
125 if (!isdot && !isdotdot) {
126 count++;
127 namelen += dep->namelen + has_ftype;
128 } else if (isdotdot)
129 parent = be64_to_cpu(dep->inumber);
130 /*
131 * Calculate the new size, see if we should give up yet.
132 */
133 size = xfs_dir2_sf_hdr_size(i8count) + /* header */
134 count + /* namelen */
135 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
136 namelen + /* name */
137 (i8count ? /* inumber */
138 (uint)sizeof(xfs_dir2_ino8_t) * count :
139 (uint)sizeof(xfs_dir2_ino4_t) * count);
140 if (size > XFS_IFORK_DSIZE(dp))
141 return size; /* size value is a failure */
142 }
143 /*
144 * Create the output header, if it worked.
145 */
146 sfhp->count = count;
147 sfhp->i8count = i8count;
148 dp->d_ops->sf_put_parent_ino(sfhp, parent);
149 return size;
150}
151
152/*
153 * Convert a block format directory to shortform.
154 * Caller has already checked that it will fit, and built us a header.
155 */
156int /* error */
157xfs_dir2_block_to_sf(
158 xfs_da_args_t *args, /* operation arguments */
159 struct xfs_buf *bp,
160 int size, /* shortform directory size */
161 xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
162{
163 xfs_dir2_data_hdr_t *hdr; /* block header */
164 xfs_dir2_block_tail_t *btp; /* block tail pointer */
165 xfs_dir2_data_entry_t *dep; /* data entry pointer */
166 xfs_inode_t *dp; /* incore directory inode */
167 xfs_dir2_data_unused_t *dup; /* unused data pointer */
168 char *endptr; /* end of data entries */
169 int error; /* error return value */
170 int logflags; /* inode logging flags */
171 xfs_mount_t *mp; /* filesystem mount point */
172 char *ptr; /* current data pointer */
173 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
174 xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
175 xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
176
177 trace_xfs_dir2_block_to_sf(args);
178
179 dp = args->dp;
180 mp = dp->i_mount;
181
182 /*
183 * allocate a temporary destination buffer the size of the inode
184 * to format the data into. Once we have formatted the data, we
185 * can free the block and copy the formatted data into the inode literal
186 * area.
187 */
188 dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
189 hdr = bp->b_addr;
190
191 /*
192 * Copy the header into the newly allocate local space.
193 */
194 sfp = (xfs_dir2_sf_hdr_t *)dst;
195 memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
196
197 /*
198 * Set up to loop over the block's entries.
199 */
200 btp = xfs_dir2_block_tail_p(args->geo, hdr);
201 ptr = (char *)dp->d_ops->data_entry_p(hdr);
202 endptr = (char *)xfs_dir2_block_leaf_p(btp);
203 sfep = xfs_dir2_sf_firstentry(sfp);
204 /*
205 * Loop over the active and unused entries.
206 * Stop when we reach the leaf/tail portion of the block.
207 */
208 while (ptr < endptr) {
209 /*
210 * If it's unused, just skip over it.
211 */
212 dup = (xfs_dir2_data_unused_t *)ptr;
213 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
214 ptr += be16_to_cpu(dup->length);
215 continue;
216 }
217 dep = (xfs_dir2_data_entry_t *)ptr;
218 /*
219 * Skip .
220 */
221 if (dep->namelen == 1 && dep->name[0] == '.')
222 ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
223 /*
224 * Skip .., but make sure the inode number is right.
225 */
226 else if (dep->namelen == 2 &&
227 dep->name[0] == '.' && dep->name[1] == '.')
228 ASSERT(be64_to_cpu(dep->inumber) ==
229 dp->d_ops->sf_get_parent_ino(sfp));
230 /*
231 * Normal entry, copy it into shortform.
232 */
233 else {
234 sfep->namelen = dep->namelen;
235 xfs_dir2_sf_put_offset(sfep,
236 (xfs_dir2_data_aoff_t)
237 ((char *)dep - (char *)hdr));
238 memcpy(sfep->name, dep->name, dep->namelen);
239 dp->d_ops->sf_put_ino(sfp, sfep,
240 be64_to_cpu(dep->inumber));
241 dp->d_ops->sf_put_ftype(sfep,
242 dp->d_ops->data_get_ftype(dep));
243
244 sfep = dp->d_ops->sf_nextentry(sfp, sfep);
245 }
246 ptr += dp->d_ops->data_entsize(dep->namelen);
247 }
248 ASSERT((char *)sfep - (char *)sfp == size);
249
250 /* now we are done with the block, we can shrink the inode */
251 logflags = XFS_ILOG_CORE;
252 error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
253 if (error) {
254 ASSERT(error != -ENOSPC);
255 goto out;
256 }
257
258 /*
259 * The buffer is now unconditionally gone, whether
260 * xfs_dir2_shrink_inode worked or not.
261 *
262 * Convert the inode to local format and copy the data in.
263 */
264 dp->i_df.if_flags &= ~XFS_IFEXTENTS;
265 dp->i_df.if_flags |= XFS_IFINLINE;
266 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
267 ASSERT(dp->i_df.if_bytes == 0);
268 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
269
270 logflags |= XFS_ILOG_DDATA;
271 memcpy(dp->i_df.if_u1.if_data, dst, size);
272 dp->i_d.di_size = size;
273 xfs_dir2_sf_check(args);
274out:
275 xfs_trans_log_inode(args->trans, dp, logflags);
276 kmem_free(dst);
277 return error;
278}
279
280/*
281 * Add a name to a shortform directory.
282 * There are two algorithms, "easy" and "hard" which we decide on
283 * before changing anything.
284 * Convert to block form if necessary, if the new entry won't fit.
285 */
286int /* error */
287xfs_dir2_sf_addname(
288 xfs_da_args_t *args) /* operation arguments */
289{
290 xfs_inode_t *dp; /* incore directory inode */
291 int error; /* error return value */
292 int incr_isize; /* total change in size */
293 int new_isize; /* di_size after adding name */
294 int objchange; /* changing to 8-byte inodes */
295 xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
296 int pick; /* which algorithm to use */
297 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
298 xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
299
300 trace_xfs_dir2_sf_addname(args);
301
302 ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
303 dp = args->dp;
304 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
305 /*
306 * Make sure the shortform value has some of its header.
307 */
308 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
309 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
310 return -EIO;
311 }
312 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
313 ASSERT(dp->i_df.if_u1.if_data != NULL);
314 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
315 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
316 /*
317 * Compute entry (and change in) size.
318 */
319 incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
320 objchange = 0;
321#if XFS_BIG_INUMS
322 /*
323 * Do we have to change to 8 byte inodes?
324 */
325 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
326 /*
327 * Yes, adjust the inode size. old count + (parent + new)
328 */
329 incr_isize +=
330 (sfp->count + 2) *
331 ((uint)sizeof(xfs_dir2_ino8_t) -
332 (uint)sizeof(xfs_dir2_ino4_t));
333 objchange = 1;
334 }
335#endif
336 new_isize = (int)dp->i_d.di_size + incr_isize;
337 /*
338 * Won't fit as shortform any more (due to size),
339 * or the pick routine says it won't (due to offset values).
340 */
341 if (new_isize > XFS_IFORK_DSIZE(dp) ||
342 (pick =
343 xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
344 /*
345 * Just checking or no space reservation, it doesn't fit.
346 */
347 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
348 return -ENOSPC;
349 /*
350 * Convert to block form then add the name.
351 */
352 error = xfs_dir2_sf_to_block(args);
353 if (error)
354 return error;
355 return xfs_dir2_block_addname(args);
356 }
357 /*
358 * Just checking, it fits.
359 */
360 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
361 return 0;
362 /*
363 * Do it the easy way - just add it at the end.
364 */
365 if (pick == 1)
366 xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
367 /*
368 * Do it the hard way - look for a place to insert the new entry.
369 * Convert to 8 byte inode numbers first if necessary.
370 */
371 else {
372 ASSERT(pick == 2);
373#if XFS_BIG_INUMS
374 if (objchange)
375 xfs_dir2_sf_toino8(args);
376#endif
377 xfs_dir2_sf_addname_hard(args, objchange, new_isize);
378 }
379 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
380 return 0;
381}
382
383/*
384 * Add the new entry the "easy" way.
385 * This is copying the old directory and adding the new entry at the end.
386 * Since it's sorted by "offset" we need room after the last offset
387 * that's already there, and then room to convert to a block directory.
388 * This is already checked by the pick routine.
389 */
390static void
391xfs_dir2_sf_addname_easy(
392 xfs_da_args_t *args, /* operation arguments */
393 xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */
394 xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
395 int new_isize) /* new directory size */
396{
397 int byteoff; /* byte offset in sf dir */
398 xfs_inode_t *dp; /* incore directory inode */
399 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
400
401 dp = args->dp;
402
403 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
404 byteoff = (int)((char *)sfep - (char *)sfp);
405 /*
406 * Grow the in-inode space.
407 */
408 xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
409 XFS_DATA_FORK);
410 /*
411 * Need to set up again due to realloc of the inode data.
412 */
413 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
414 sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
415 /*
416 * Fill in the new entry.
417 */
418 sfep->namelen = args->namelen;
419 xfs_dir2_sf_put_offset(sfep, offset);
420 memcpy(sfep->name, args->name, sfep->namelen);
421 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
422 dp->d_ops->sf_put_ftype(sfep, args->filetype);
423
424 /*
425 * Update the header and inode.
426 */
427 sfp->count++;
428#if XFS_BIG_INUMS
429 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
430 sfp->i8count++;
431#endif
432 dp->i_d.di_size = new_isize;
433 xfs_dir2_sf_check(args);
434}
435
436/*
437 * Add the new entry the "hard" way.
438 * The caller has already converted to 8 byte inode numbers if necessary,
439 * in which case we need to leave the i8count at 1.
440 * Find a hole that the new entry will fit into, and copy
441 * the first part of the entries, the new entry, and the last part of
442 * the entries.
443 */
444/* ARGSUSED */
445static void
446xfs_dir2_sf_addname_hard(
447 xfs_da_args_t *args, /* operation arguments */
448 int objchange, /* changing inode number size */
449 int new_isize) /* new directory size */
450{
451 int add_datasize; /* data size need for new ent */
452 char *buf; /* buffer for old */
453 xfs_inode_t *dp; /* incore directory inode */
454 int eof; /* reached end of old dir */
455 int nbytes; /* temp for byte copies */
456 xfs_dir2_data_aoff_t new_offset; /* next offset value */
457 xfs_dir2_data_aoff_t offset; /* current offset value */
458 int old_isize; /* previous di_size */
459 xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
460 xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
461 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
462 xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
463 struct xfs_mount *mp;
464
465 /*
466 * Copy the old directory to the stack buffer.
467 */
468 dp = args->dp;
469 mp = dp->i_mount;
470
471 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
472 old_isize = (int)dp->i_d.di_size;
473 buf = kmem_alloc(old_isize, KM_SLEEP);
474 oldsfp = (xfs_dir2_sf_hdr_t *)buf;
475 memcpy(oldsfp, sfp, old_isize);
476 /*
477 * Loop over the old directory finding the place we're going
478 * to insert the new entry.
479 * If it's going to end up at the end then oldsfep will point there.
480 */
481 for (offset = dp->d_ops->data_first_offset,
482 oldsfep = xfs_dir2_sf_firstentry(oldsfp),
483 add_datasize = dp->d_ops->data_entsize(args->namelen),
484 eof = (char *)oldsfep == &buf[old_isize];
485 !eof;
486 offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
487 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
488 eof = (char *)oldsfep == &buf[old_isize]) {
489 new_offset = xfs_dir2_sf_get_offset(oldsfep);
490 if (offset + add_datasize <= new_offset)
491 break;
492 }
493 /*
494 * Get rid of the old directory, then allocate space for
495 * the new one. We do this so xfs_idata_realloc won't copy
496 * the data.
497 */
498 xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
499 xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
500 /*
501 * Reset the pointer since the buffer was reallocated.
502 */
503 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
504 /*
505 * Copy the first part of the directory, including the header.
506 */
507 nbytes = (int)((char *)oldsfep - (char *)oldsfp);
508 memcpy(sfp, oldsfp, nbytes);
509 sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
510 /*
511 * Fill in the new entry, and update the header counts.
512 */
513 sfep->namelen = args->namelen;
514 xfs_dir2_sf_put_offset(sfep, offset);
515 memcpy(sfep->name, args->name, sfep->namelen);
516 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
517 dp->d_ops->sf_put_ftype(sfep, args->filetype);
518 sfp->count++;
519#if XFS_BIG_INUMS
520 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
521 sfp->i8count++;
522#endif
523 /*
524 * If there's more left to copy, do that.
525 */
526 if (!eof) {
527 sfep = dp->d_ops->sf_nextentry(sfp, sfep);
528 memcpy(sfep, oldsfep, old_isize - nbytes);
529 }
530 kmem_free(buf);
531 dp->i_d.di_size = new_isize;
532 xfs_dir2_sf_check(args);
533}
534
535/*
536 * Decide if the new entry will fit at all.
537 * If it will fit, pick between adding the new entry to the end (easy)
538 * or somewhere else (hard).
539 * Return 0 (won't fit), 1 (easy), 2 (hard).
540 */
541/*ARGSUSED*/
542static int /* pick result */
543xfs_dir2_sf_addname_pick(
544 xfs_da_args_t *args, /* operation arguments */
545 int objchange, /* inode # size changes */
546 xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
547 xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
548{
549 xfs_inode_t *dp; /* incore directory inode */
550 int holefit; /* found hole it will fit in */
551 int i; /* entry number */
552 xfs_mount_t *mp; /* filesystem mount point */
553 xfs_dir2_data_aoff_t offset; /* data block offset */
554 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
555 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
556 int size; /* entry's data size */
557 int used; /* data bytes used */
558
559 dp = args->dp;
560 mp = dp->i_mount;
561
562 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
563 size = dp->d_ops->data_entsize(args->namelen);
564 offset = dp->d_ops->data_first_offset;
565 sfep = xfs_dir2_sf_firstentry(sfp);
566 holefit = 0;
567 /*
568 * Loop over sf entries.
569 * Keep track of data offset and whether we've seen a place
570 * to insert the new entry.
571 */
572 for (i = 0; i < sfp->count; i++) {
573 if (!holefit)
574 holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
575 offset = xfs_dir2_sf_get_offset(sfep) +
576 dp->d_ops->data_entsize(sfep->namelen);
577 sfep = dp->d_ops->sf_nextentry(sfp, sfep);
578 }
579 /*
580 * Calculate data bytes used excluding the new entry, if this
581 * was a data block (block form directory).
582 */
583 used = offset +
584 (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
585 (uint)sizeof(xfs_dir2_block_tail_t);
586 /*
587 * If it won't fit in a block form then we can't insert it,
588 * we'll go back, convert to block, then try the insert and convert
589 * to leaf.
590 */
591 if (used + (holefit ? 0 : size) > args->geo->blksize)
592 return 0;
593 /*
594 * If changing the inode number size, do it the hard way.
595 */
596#if XFS_BIG_INUMS
597 if (objchange) {
598 return 2;
599 }
600#else
601 ASSERT(objchange == 0);
602#endif
603 /*
604 * If it won't fit at the end then do it the hard way (use the hole).
605 */
606 if (used + size > args->geo->blksize)
607 return 2;
608 /*
609 * Do it the easy way.
610 */
611 *sfepp = sfep;
612 *offsetp = offset;
613 return 1;
614}
615
616#ifdef DEBUG
617/*
618 * Check consistency of shortform directory, assert if bad.
619 */
620static void
621xfs_dir2_sf_check(
622 xfs_da_args_t *args) /* operation arguments */
623{
624 xfs_inode_t *dp; /* incore directory inode */
625 int i; /* entry number */
626 int i8count; /* number of big inode#s */
627 xfs_ino_t ino; /* entry inode number */
628 int offset; /* data offset */
629 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
630 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
631 struct xfs_mount *mp;
632
633 dp = args->dp;
634 mp = dp->i_mount;
635
636 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
637 offset = dp->d_ops->data_first_offset;
638 ino = dp->d_ops->sf_get_parent_ino(sfp);
639 i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
640
641 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
642 i < sfp->count;
643 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
644 ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
645 ino = dp->d_ops->sf_get_ino(sfp, sfep);
646 i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
647 offset =
648 xfs_dir2_sf_get_offset(sfep) +
649 dp->d_ops->data_entsize(sfep->namelen);
650 ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
651 }
652 ASSERT(i8count == sfp->i8count);
653 ASSERT(XFS_BIG_INUMS || i8count == 0);
654 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
655 ASSERT(offset +
656 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
657 (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
658}
659#endif /* DEBUG */
660
661/*
662 * Create a new (shortform) directory.
663 */
664int /* error, always 0 */
665xfs_dir2_sf_create(
666 xfs_da_args_t *args, /* operation arguments */
667 xfs_ino_t pino) /* parent inode number */
668{
669 xfs_inode_t *dp; /* incore directory inode */
670 int i8count; /* parent inode is an 8-byte number */
671 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
672 int size; /* directory size */
673
674 trace_xfs_dir2_sf_create(args);
675
676 dp = args->dp;
677
678 ASSERT(dp != NULL);
679 ASSERT(dp->i_d.di_size == 0);
680 /*
681 * If it's currently a zero-length extent file,
682 * convert it to local format.
683 */
684 if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
685 dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
686 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
687 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
688 dp->i_df.if_flags |= XFS_IFINLINE;
689 }
690 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
691 ASSERT(dp->i_df.if_bytes == 0);
692 i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
693 size = xfs_dir2_sf_hdr_size(i8count);
694 /*
695 * Make a buffer for the data.
696 */
697 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
698 /*
699 * Fill in the header,
700 */
701 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
702 sfp->i8count = i8count;
703 /*
704 * Now can put in the inode number, since i8count is set.
705 */
706 dp->d_ops->sf_put_parent_ino(sfp, pino);
707 sfp->count = 0;
708 dp->i_d.di_size = size;
709 xfs_dir2_sf_check(args);
710 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
711 return 0;
712}
713
714/*
715 * Lookup an entry in a shortform directory.
716 * Returns EEXIST if found, ENOENT if not found.
717 */
718int /* error */
719xfs_dir2_sf_lookup(
720 xfs_da_args_t *args) /* operation arguments */
721{
722 xfs_inode_t *dp; /* incore directory inode */
723 int i; /* entry index */
724 int error;
725 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
726 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
727 enum xfs_dacmp cmp; /* comparison result */
728 xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
729
730 trace_xfs_dir2_sf_lookup(args);
731
732 xfs_dir2_sf_check(args);
733 dp = args->dp;
734
735 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
736 /*
737 * Bail out if the directory is way too short.
738 */
739 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
740 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
741 return -EIO;
742 }
743 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
744 ASSERT(dp->i_df.if_u1.if_data != NULL);
745 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
746 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
747 /*
748 * Special case for .
749 */
750 if (args->namelen == 1 && args->name[0] == '.') {
751 args->inumber = dp->i_ino;
752 args->cmpresult = XFS_CMP_EXACT;
753 args->filetype = XFS_DIR3_FT_DIR;
754 return -EEXIST;
755 }
756 /*
757 * Special case for ..
758 */
759 if (args->namelen == 2 &&
760 args->name[0] == '.' && args->name[1] == '.') {
761 args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
762 args->cmpresult = XFS_CMP_EXACT;
763 args->filetype = XFS_DIR3_FT_DIR;
764 return -EEXIST;
765 }
766 /*
767 * Loop over all the entries trying to match ours.
768 */
769 ci_sfep = NULL;
770 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
771 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
772 /*
773 * Compare name and if it's an exact match, return the inode
774 * number. If it's the first case-insensitive match, store the
775 * inode number and continue looking for an exact match.
776 */
777 cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
778 sfep->namelen);
779 if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
780 args->cmpresult = cmp;
781 args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
782 args->filetype = dp->d_ops->sf_get_ftype(sfep);
783 if (cmp == XFS_CMP_EXACT)
784 return -EEXIST;
785 ci_sfep = sfep;
786 }
787 }
788 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
789 /*
790 * Here, we can only be doing a lookup (not a rename or replace).
791 * If a case-insensitive match was not found, return -ENOENT.
792 */
793 if (!ci_sfep)
794 return -ENOENT;
795 /* otherwise process the CI match as required by the caller */
796 error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
797 return error;
798}
799
800/*
801 * Remove an entry from a shortform directory.
802 */
803int /* error */
804xfs_dir2_sf_removename(
805 xfs_da_args_t *args)
806{
807 int byteoff; /* offset of removed entry */
808 xfs_inode_t *dp; /* incore directory inode */
809 int entsize; /* this entry's size */
810 int i; /* shortform entry index */
811 int newsize; /* new inode size */
812 int oldsize; /* old inode size */
813 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
814 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
815
816 trace_xfs_dir2_sf_removename(args);
817
818 dp = args->dp;
819
820 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
821 oldsize = (int)dp->i_d.di_size;
822 /*
823 * Bail out if the directory is way too short.
824 */
825 if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
826 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
827 return -EIO;
828 }
829 ASSERT(dp->i_df.if_bytes == oldsize);
830 ASSERT(dp->i_df.if_u1.if_data != NULL);
831 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
832 ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
833 /*
834 * Loop over the old directory entries.
835 * Find the one we're deleting.
836 */
837 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
838 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
839 if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
840 XFS_CMP_EXACT) {
841 ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
842 args->inumber);
843 break;
844 }
845 }
846 /*
847 * Didn't find it.
848 */
849 if (i == sfp->count)
850 return -ENOENT;
851 /*
852 * Calculate sizes.
853 */
854 byteoff = (int)((char *)sfep - (char *)sfp);
855 entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
856 newsize = oldsize - entsize;
857 /*
858 * Copy the part if any after the removed entry, sliding it down.
859 */
860 if (byteoff + entsize < oldsize)
861 memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
862 oldsize - (byteoff + entsize));
863 /*
864 * Fix up the header and file size.
865 */
866 sfp->count--;
867 dp->i_d.di_size = newsize;
868 /*
869 * Reallocate, making it smaller.
870 */
871 xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
872 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
873#if XFS_BIG_INUMS
874 /*
875 * Are we changing inode number size?
876 */
877 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
878 if (sfp->i8count == 1)
879 xfs_dir2_sf_toino4(args);
880 else
881 sfp->i8count--;
882 }
883#endif
884 xfs_dir2_sf_check(args);
885 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
886 return 0;
887}
888
889/*
890 * Replace the inode number of an entry in a shortform directory.
891 */
892int /* error */
893xfs_dir2_sf_replace(
894 xfs_da_args_t *args) /* operation arguments */
895{
896 xfs_inode_t *dp; /* incore directory inode */
897 int i; /* entry index */
898#if XFS_BIG_INUMS || defined(DEBUG)
899 xfs_ino_t ino=0; /* entry old inode number */
900#endif
901#if XFS_BIG_INUMS
902 int i8elevated; /* sf_toino8 set i8count=1 */
903#endif
904 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
905 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
906
907 trace_xfs_dir2_sf_replace(args);
908
909 dp = args->dp;
910
911 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
912 /*
913 * Bail out if the shortform directory is way too small.
914 */
915 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
916 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
917 return -EIO;
918 }
919 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
920 ASSERT(dp->i_df.if_u1.if_data != NULL);
921 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
922 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
923#if XFS_BIG_INUMS
924 /*
925 * New inode number is large, and need to convert to 8-byte inodes.
926 */
927 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
928 int error; /* error return value */
929 int newsize; /* new inode size */
930
931 newsize =
932 dp->i_df.if_bytes +
933 (sfp->count + 1) *
934 ((uint)sizeof(xfs_dir2_ino8_t) -
935 (uint)sizeof(xfs_dir2_ino4_t));
936 /*
937 * Won't fit as shortform, convert to block then do replace.
938 */
939 if (newsize > XFS_IFORK_DSIZE(dp)) {
940 error = xfs_dir2_sf_to_block(args);
941 if (error) {
942 return error;
943 }
944 return xfs_dir2_block_replace(args);
945 }
946 /*
947 * Still fits, convert to 8-byte now.
948 */
949 xfs_dir2_sf_toino8(args);
950 i8elevated = 1;
951 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
952 } else
953 i8elevated = 0;
954#endif
955 ASSERT(args->namelen != 1 || args->name[0] != '.');
956 /*
957 * Replace ..'s entry.
958 */
959 if (args->namelen == 2 &&
960 args->name[0] == '.' && args->name[1] == '.') {
961#if XFS_BIG_INUMS || defined(DEBUG)
962 ino = dp->d_ops->sf_get_parent_ino(sfp);
963 ASSERT(args->inumber != ino);
964#endif
965 dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
966 }
967 /*
968 * Normal entry, look for the name.
969 */
970 else {
971 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
972 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
973 if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
974 XFS_CMP_EXACT) {
975#if XFS_BIG_INUMS || defined(DEBUG)
976 ino = dp->d_ops->sf_get_ino(sfp, sfep);
977 ASSERT(args->inumber != ino);
978#endif
979 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
980 dp->d_ops->sf_put_ftype(sfep, args->filetype);
981 break;
982 }
983 }
984 /*
985 * Didn't find it.
986 */
987 if (i == sfp->count) {
988 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
989#if XFS_BIG_INUMS
990 if (i8elevated)
991 xfs_dir2_sf_toino4(args);
992#endif
993 return -ENOENT;
994 }
995 }
996#if XFS_BIG_INUMS
997 /*
998 * See if the old number was large, the new number is small.
999 */
1000 if (ino > XFS_DIR2_MAX_SHORT_INUM &&
1001 args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
1002 /*
1003 * And the old count was one, so need to convert to small.
1004 */
1005 if (sfp->i8count == 1)
1006 xfs_dir2_sf_toino4(args);
1007 else
1008 sfp->i8count--;
1009 }
1010 /*
1011 * See if the old number was small, the new number is large.
1012 */
1013 if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
1014 args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
1015 /*
1016 * add to the i8count unless we just converted to 8-byte
1017 * inodes (which does an implied i8count = 1)
1018 */
1019 ASSERT(sfp->i8count != 0);
1020 if (!i8elevated)
1021 sfp->i8count++;
1022 }
1023#endif
1024 xfs_dir2_sf_check(args);
1025 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
1026 return 0;
1027}
1028
1029#if XFS_BIG_INUMS
1030/*
1031 * Convert from 8-byte inode numbers to 4-byte inode numbers.
1032 * The last 8-byte inode number is gone, but the count is still 1.
1033 */
1034static void
1035xfs_dir2_sf_toino4(
1036 xfs_da_args_t *args) /* operation arguments */
1037{
1038 char *buf; /* old dir's buffer */
1039 xfs_inode_t *dp; /* incore directory inode */
1040 int i; /* entry index */
1041 int newsize; /* new inode size */
1042 xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
1043 xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
1044 int oldsize; /* old inode size */
1045 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1046 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1047 struct xfs_mount *mp;
1048
1049 trace_xfs_dir2_sf_toino4(args);
1050
1051 dp = args->dp;
1052 mp = dp->i_mount;
1053
1054 /*
1055 * Copy the old directory to the buffer.
1056 * Then nuke it from the inode, and add the new buffer to the inode.
1057 * Don't want xfs_idata_realloc copying the data here.
1058 */
1059 oldsize = dp->i_df.if_bytes;
1060 buf = kmem_alloc(oldsize, KM_SLEEP);
1061 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1062 ASSERT(oldsfp->i8count == 1);
1063 memcpy(buf, oldsfp, oldsize);
1064 /*
1065 * Compute the new inode size.
1066 */
1067 newsize =
1068 oldsize -
1069 (oldsfp->count + 1) *
1070 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1071 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1072 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1073 /*
1074 * Reset our pointers, the data has moved.
1075 */
1076 oldsfp = (xfs_dir2_sf_hdr_t *)buf;
1077 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1078 /*
1079 * Fill in the new header.
1080 */
1081 sfp->count = oldsfp->count;
1082 sfp->i8count = 0;
1083 dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
1084 /*
1085 * Copy the entries field by field.
1086 */
1087 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
1088 oldsfep = xfs_dir2_sf_firstentry(oldsfp);
1089 i < sfp->count;
1090 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1091 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1092 sfep->namelen = oldsfep->namelen;
1093 sfep->offset = oldsfep->offset;
1094 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1095 dp->d_ops->sf_put_ino(sfp, sfep,
1096 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
1097 dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
1098 }
1099 /*
1100 * Clean up the inode.
1101 */
1102 kmem_free(buf);
1103 dp->i_d.di_size = newsize;
1104 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1105}
1106
1107/*
1108 * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
1109 * The new entry w/ an 8-byte inode number is not there yet; we leave with
1110 * i8count set to 1, but no corresponding 8-byte entry.
1111 */
1112static void
1113xfs_dir2_sf_toino8(
1114 xfs_da_args_t *args) /* operation arguments */
1115{
1116 char *buf; /* old dir's buffer */
1117 xfs_inode_t *dp; /* incore directory inode */
1118 int i; /* entry index */
1119 int newsize; /* new inode size */
1120 xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
1121 xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
1122 int oldsize; /* old inode size */
1123 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1124 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1125 struct xfs_mount *mp;
1126
1127 trace_xfs_dir2_sf_toino8(args);
1128
1129 dp = args->dp;
1130 mp = dp->i_mount;
1131
1132 /*
1133 * Copy the old directory to the buffer.
1134 * Then nuke it from the inode, and add the new buffer to the inode.
1135 * Don't want xfs_idata_realloc copying the data here.
1136 */
1137 oldsize = dp->i_df.if_bytes;
1138 buf = kmem_alloc(oldsize, KM_SLEEP);
1139 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1140 ASSERT(oldsfp->i8count == 0);
1141 memcpy(buf, oldsfp, oldsize);
1142 /*
1143 * Compute the new inode size (nb: entry count + 1 for parent)
1144 */
1145 newsize =
1146 oldsize +
1147 (oldsfp->count + 1) *
1148 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1149 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1150 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1151 /*
1152 * Reset our pointers, the data has moved.
1153 */
1154 oldsfp = (xfs_dir2_sf_hdr_t *)buf;
1155 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1156 /*
1157 * Fill in the new header.
1158 */
1159 sfp->count = oldsfp->count;
1160 sfp->i8count = 1;
1161 dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
1162 /*
1163 * Copy the entries field by field.
1164 */
1165 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
1166 oldsfep = xfs_dir2_sf_firstentry(oldsfp);
1167 i < sfp->count;
1168 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1169 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1170 sfep->namelen = oldsfep->namelen;
1171 sfep->offset = oldsfep->offset;
1172 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1173 dp->d_ops->sf_put_ino(sfp, sfep,
1174 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
1175 dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
1176 }
1177 /*
1178 * Clean up the inode.
1179 */
1180 kmem_free(buf);
1181 dp->i_d.di_size = newsize;
1182 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1183}
1184#endif /* XFS_BIG_INUMS */
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
new file mode 100644
index 000000000000..bb969337efc8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -0,0 +1,290 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_shared.h"
22#include "xfs_format.h"
23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_inode.h"
29#include "xfs_quota.h"
30#include "xfs_trans.h"
31#include "xfs_qm.h"
32#include "xfs_error.h"
33#include "xfs_cksum.h"
34#include "xfs_trace.h"
35
36int
37xfs_calc_dquots_per_chunk(
38 unsigned int nbblks) /* basic block units */
39{
40 unsigned int ndquots;
41
42 ASSERT(nbblks > 0);
43 ndquots = BBTOB(nbblks);
44 do_div(ndquots, sizeof(xfs_dqblk_t));
45
46 return ndquots;
47}
48
49/*
50 * Do some primitive error checking on ondisk dquot data structures.
51 */
52int
53xfs_dqcheck(
54 struct xfs_mount *mp,
55 xfs_disk_dquot_t *ddq,
56 xfs_dqid_t id,
57 uint type, /* used only when IO_dorepair is true */
58 uint flags,
59 char *str)
60{
61 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
62 int errs = 0;
63
64 /*
65 * We can encounter an uninitialized dquot buffer for 2 reasons:
66 * 1. If we crash while deleting the quotainode(s), and those blks got
67 * used for user data. This is because we take the path of regular
68 * file deletion; however, the size field of quotainodes is never
69 * updated, so all the tricks that we play in itruncate_finish
70 * don't quite matter.
71 *
72 * 2. We don't play the quota buffers when there's a quotaoff logitem.
73 * But the allocation will be replayed so we'll end up with an
74 * uninitialized quota block.
75 *
76 * This is all fine; things are still consistent, and we haven't lost
77 * any quota information. Just don't complain about bad dquot blks.
78 */
79 if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
80 if (flags & XFS_QMOPT_DOWARN)
81 xfs_alert(mp,
82 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
83 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
84 errs++;
85 }
86 if (ddq->d_version != XFS_DQUOT_VERSION) {
87 if (flags & XFS_QMOPT_DOWARN)
88 xfs_alert(mp,
89 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
90 str, id, ddq->d_version, XFS_DQUOT_VERSION);
91 errs++;
92 }
93
94 if (ddq->d_flags != XFS_DQ_USER &&
95 ddq->d_flags != XFS_DQ_PROJ &&
96 ddq->d_flags != XFS_DQ_GROUP) {
97 if (flags & XFS_QMOPT_DOWARN)
98 xfs_alert(mp,
99 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
100 str, id, ddq->d_flags);
101 errs++;
102 }
103
104 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
105 if (flags & XFS_QMOPT_DOWARN)
106 xfs_alert(mp,
107 "%s : ondisk-dquot 0x%p, ID mismatch: "
108 "0x%x expected, found id 0x%x",
109 str, ddq, id, be32_to_cpu(ddq->d_id));
110 errs++;
111 }
112
113 if (!errs && ddq->d_id) {
114 if (ddq->d_blk_softlimit &&
115 be64_to_cpu(ddq->d_bcount) >
116 be64_to_cpu(ddq->d_blk_softlimit)) {
117 if (!ddq->d_btimer) {
118 if (flags & XFS_QMOPT_DOWARN)
119 xfs_alert(mp,
120 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
121 str, (int)be32_to_cpu(ddq->d_id), ddq);
122 errs++;
123 }
124 }
125 if (ddq->d_ino_softlimit &&
126 be64_to_cpu(ddq->d_icount) >
127 be64_to_cpu(ddq->d_ino_softlimit)) {
128 if (!ddq->d_itimer) {
129 if (flags & XFS_QMOPT_DOWARN)
130 xfs_alert(mp,
131 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
132 str, (int)be32_to_cpu(ddq->d_id), ddq);
133 errs++;
134 }
135 }
136 if (ddq->d_rtb_softlimit &&
137 be64_to_cpu(ddq->d_rtbcount) >
138 be64_to_cpu(ddq->d_rtb_softlimit)) {
139 if (!ddq->d_rtbtimer) {
140 if (flags & XFS_QMOPT_DOWARN)
141 xfs_alert(mp,
142 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
143 str, (int)be32_to_cpu(ddq->d_id), ddq);
144 errs++;
145 }
146 }
147 }
148
149 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
150 return errs;
151
152 if (flags & XFS_QMOPT_DOWARN)
153 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
154
155 /*
156 * Typically, a repair is only requested by quotacheck.
157 */
158 ASSERT(id != -1);
159 ASSERT(flags & XFS_QMOPT_DQREPAIR);
160 memset(d, 0, sizeof(xfs_dqblk_t));
161
162 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
163 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
164 d->dd_diskdq.d_flags = type;
165 d->dd_diskdq.d_id = cpu_to_be32(id);
166
167 if (xfs_sb_version_hascrc(&mp->m_sb)) {
168 uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
169 xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
170 XFS_DQUOT_CRC_OFF);
171 }
172
173 return errs;
174}
175
176STATIC bool
177xfs_dquot_buf_verify_crc(
178 struct xfs_mount *mp,
179 struct xfs_buf *bp)
180{
181 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
182 int ndquots;
183 int i;
184
185 if (!xfs_sb_version_hascrc(&mp->m_sb))
186 return true;
187
188 /*
189 * if we are in log recovery, the quota subsystem has not been
190 * initialised so we have no quotainfo structure. In that case, we need
191 * to manually calculate the number of dquots in the buffer.
192 */
193 if (mp->m_quotainfo)
194 ndquots = mp->m_quotainfo->qi_dqperchunk;
195 else
196 ndquots = xfs_calc_dquots_per_chunk(
197 XFS_BB_TO_FSB(mp, bp->b_length));
198
199 for (i = 0; i < ndquots; i++, d++) {
200 if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
201 XFS_DQUOT_CRC_OFF))
202 return false;
203 if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
204 return false;
205 }
206 return true;
207}
208
209STATIC bool
210xfs_dquot_buf_verify(
211 struct xfs_mount *mp,
212 struct xfs_buf *bp)
213{
214 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
215 xfs_dqid_t id = 0;
216 int ndquots;
217 int i;
218
219 /*
220 * if we are in log recovery, the quota subsystem has not been
221 * initialised so we have no quotainfo structure. In that case, we need
222 * to manually calculate the number of dquots in the buffer.
223 */
224 if (mp->m_quotainfo)
225 ndquots = mp->m_quotainfo->qi_dqperchunk;
226 else
227 ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
228
229 /*
230 * On the first read of the buffer, verify that each dquot is valid.
231 * We don't know what the id of the dquot is supposed to be, just that
232 * they should be increasing monotonically within the buffer. If the
233 * first id is corrupt, then it will fail on the second dquot in the
234 * buffer so corruptions could point to the wrong dquot in this case.
235 */
236 for (i = 0; i < ndquots; i++) {
237 struct xfs_disk_dquot *ddq;
238 int error;
239
240 ddq = &d[i].dd_diskdq;
241
242 if (i == 0)
243 id = be32_to_cpu(ddq->d_id);
244
245 error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
246 "xfs_dquot_buf_verify");
247 if (error)
248 return false;
249 }
250 return true;
251}
252
253static void
254xfs_dquot_buf_read_verify(
255 struct xfs_buf *bp)
256{
257 struct xfs_mount *mp = bp->b_target->bt_mount;
258
259 if (!xfs_dquot_buf_verify_crc(mp, bp))
260 xfs_buf_ioerror(bp, -EFSBADCRC);
261 else if (!xfs_dquot_buf_verify(mp, bp))
262 xfs_buf_ioerror(bp, -EFSCORRUPTED);
263
264 if (bp->b_error)
265 xfs_verifier_error(bp);
266}
267
268/*
269 * we don't calculate the CRC here as that is done when the dquot is flushed to
270 * the buffer after the update is done. This ensures that the dquot in the
271 * buffer always has an up-to-date CRC value.
272 */
273static void
274xfs_dquot_buf_write_verify(
275 struct xfs_buf *bp)
276{
277 struct xfs_mount *mp = bp->b_target->bt_mount;
278
279 if (!xfs_dquot_buf_verify(mp, bp)) {
280 xfs_buf_ioerror(bp, -EFSCORRUPTED);
281 xfs_verifier_error(bp);
282 return;
283 }
284}
285
286const struct xfs_buf_ops xfs_dquot_buf_ops = {
287 .verify_read = xfs_dquot_buf_read_verify,
288 .verify_write = xfs_dquot_buf_write_verify,
289};
290
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
new file mode 100644
index 000000000000..34d85aca3058
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -0,0 +1,428 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_FORMAT_H__
19#define __XFS_FORMAT_H__
20
21/*
22 * XFS On Disk Format Definitions
23 *
24 * This header file defines all the on-disk format definitions for
25 * general XFS objects. Directory and attribute related objects are defined in
26 * xfs_da_format.h, while log and log item formats are defined in
27 * xfs_log_format.h. Everything else goes here.
28 */
29
30struct xfs_mount;
31struct xfs_trans;
32struct xfs_inode;
33struct xfs_buf;
34struct xfs_ifork;
35
36/*
37 * RealTime Device format definitions
38 */
39
40/* Min and max rt extent sizes, specified in bytes */
41#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
42#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
43#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
44
45#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
46#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
47#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
48#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
49
50/*
51 * RT Summary and bit manipulation macros.
52 */
53#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
54#define XFS_SUMOFFSTOBLOCK(mp,s) \
55 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
56#define XFS_SUMPTR(mp,bp,so) \
57 ((xfs_suminfo_t *)((bp)->b_addr + \
58 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
59
60#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
61#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
62#define XFS_BITTOWORD(mp,bi) \
63 ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
64
65#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
66#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
67
68#define XFS_RTLOBIT(w) xfs_lowbit32(w)
69#define XFS_RTHIBIT(w) xfs_highbit32(w)
70
71#if XFS_BIG_BLKNOS
72#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
73#else
74#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
75#endif
76
77/*
78 * Dquot and dquot block format definitions
79 */
80#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
81#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
82
83/*
84 * This is the main portion of the on-disk representation of quota
85 * information for a user. This is the q_core of the xfs_dquot_t that
86 * is kept in kernel memory. We pad this with some more expansion room
87 * to construct the on disk structure.
88 */
89typedef struct xfs_disk_dquot {
90 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
91 __u8 d_version; /* dquot version */
92 __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
93 __be32 d_id; /* user,project,group id */
94 __be64 d_blk_hardlimit;/* absolute limit on disk blks */
95 __be64 d_blk_softlimit;/* preferred limit on disk blks */
96 __be64 d_ino_hardlimit;/* maximum # allocated inodes */
97 __be64 d_ino_softlimit;/* preferred inode limit */
98 __be64 d_bcount; /* disk blocks owned by the user */
99 __be64 d_icount; /* inodes owned by the user */
100 __be32 d_itimer; /* zero if within inode limits if not,
101 this is when we refuse service */
102 __be32 d_btimer; /* similar to above; for disk blocks */
103 __be16 d_iwarns; /* warnings issued wrt num inodes */
104 __be16 d_bwarns; /* warnings issued wrt disk blocks */
105 __be32 d_pad0; /* 64 bit align */
106 __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
107 __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
108 __be64 d_rtbcount; /* realtime blocks owned */
109 __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
110 __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
111 __be16 d_pad;
112} xfs_disk_dquot_t;
113
114/*
115 * This is what goes on disk. This is separated from the xfs_disk_dquot because
116 * carrying the unnecessary padding would be a waste of memory.
117 */
118typedef struct xfs_dqblk {
119 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
120 char dd_fill[4]; /* filling for posterity */
121
122 /*
123 * These two are only present on filesystems with the CRC bits set.
124 */
125 __be32 dd_crc; /* checksum */
126 __be64 dd_lsn; /* last modification in log */
127 uuid_t dd_uuid; /* location information */
128} xfs_dqblk_t;
129
130#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
131
132/*
133 * Remote symlink format and access functions.
134 */
135#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
136
137struct xfs_dsymlink_hdr {
138 __be32 sl_magic;
139 __be32 sl_offset;
140 __be32 sl_bytes;
141 __be32 sl_crc;
142 uuid_t sl_uuid;
143 __be64 sl_owner;
144 __be64 sl_blkno;
145 __be64 sl_lsn;
146};
147
148#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
149
150/*
151 * The maximum pathlen is 1024 bytes. Since the minimum file system
152 * blocksize is 512 bytes, we can get a max of 3 extents back from
153 * bmapi when crc headers are taken into account.
154 */
155#define XFS_SYMLINK_MAPS 3
156
157#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
158 ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
159 sizeof(struct xfs_dsymlink_hdr) : 0))
160
161
162/*
163 * Allocation Btree format definitions
164 *
165 * There are two on-disk btrees, one sorted by blockno and one sorted
166 * by blockcount and blockno. All blocks look the same to make the code
167 * simpler; if we have time later, we'll make the optimizations.
168 */
169#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
170#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
171#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
172#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
173
174/*
175 * Data record/key structure
176 */
177typedef struct xfs_alloc_rec {
178 __be32 ar_startblock; /* starting block number */
179 __be32 ar_blockcount; /* count of free blocks */
180} xfs_alloc_rec_t, xfs_alloc_key_t;
181
182typedef struct xfs_alloc_rec_incore {
183 xfs_agblock_t ar_startblock; /* starting block number */
184 xfs_extlen_t ar_blockcount; /* count of free blocks */
185} xfs_alloc_rec_incore_t;
186
187/* btree pointer type */
188typedef __be32 xfs_alloc_ptr_t;
189
190/*
191 * Block numbers in the AG:
192 * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
193 */
194#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
195#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
196
197
198/*
199 * Inode Allocation Btree format definitions
200 *
201 * There is a btree for the inode map per allocation group.
202 */
203#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
204#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
205#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
206#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
207
typedef	__uint64_t	xfs_inofree_t;
#define	XFS_INODES_PER_CHUNK		(NBBY * sizeof(xfs_inofree_t))
#define	XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
#define	XFS_INOBT_ALL_FREE		((xfs_inofree_t)-1)
#define	XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))

/*
 * Build a free-inode mask with @n consecutive bits set, starting at bit @i.
 * When @n is at least XFS_INODES_PER_CHUNK, every bit is set before the
 * shift by @i is applied.
 */
static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
	xfs_inofree_t	top;

	if (n >= XFS_INODES_PER_CHUNK)
		top = 0;	/* 0 - 1 wraps to an all-ones mask */
	else
		top = XFS_INOBT_MASK(n);

	return (top - 1) << i;
}
218
219/*
220 * Data record structure
221 */
222typedef struct xfs_inobt_rec {
223 __be32 ir_startino; /* starting inode number */
224 __be32 ir_freecount; /* count of free inodes (set bits) */
225 __be64 ir_free; /* free inode mask */
226} xfs_inobt_rec_t;
227
228typedef struct xfs_inobt_rec_incore {
229 xfs_agino_t ir_startino; /* starting inode number */
230 __int32_t ir_freecount; /* count of free inodes (set bits) */
231 xfs_inofree_t ir_free; /* free inode mask */
232} xfs_inobt_rec_incore_t;
233
234
235/*
236 * Key structure
237 */
238typedef struct xfs_inobt_key {
239 __be32 ir_startino; /* starting inode number */
240} xfs_inobt_key_t;
241
242/* btree pointer type */
243typedef __be32 xfs_inobt_ptr_t;
244
245/*
246 * block numbers in the AG.
247 */
248#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
249#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
250
251/*
252 * The first data block of an AG depends on whether the filesystem was formatted
253 * with the finobt feature. If so, account for the finobt reserved root btree
254 * block.
255 */
256#define XFS_PREALLOC_BLOCKS(mp) \
257 (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
258 XFS_FIBT_BLOCK(mp) + 1 : \
259 XFS_IBT_BLOCK(mp) + 1)
260
261
262
263/*
264 * BMAP Btree format definitions
265 *
266 * This includes both the root block definition that sits inside an inode fork
267 * and the record/pointer formats for the leaf/node in the blocks.
268 */
269#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
270#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
271
272/*
273 * Bmap root header, on-disk form only.
274 */
275typedef struct xfs_bmdr_block {
276 __be16 bb_level; /* 0 is a leaf */
277 __be16 bb_numrecs; /* current # of data records */
278} xfs_bmdr_block_t;
279
280/*
281 * Bmap btree record and extent descriptor.
282 * l0:63 is an extent flag (value 1 indicates non-normal).
283 * l0:9-62 are startoff.
284 * l0:0-8 and l1:21-63 are startblock.
285 * l1:0-20 are blockcount.
286 */
287#define BMBT_EXNTFLAG_BITLEN 1
288#define BMBT_STARTOFF_BITLEN 54
289#define BMBT_STARTBLOCK_BITLEN 52
290#define BMBT_BLOCKCOUNT_BITLEN 21
291
292typedef struct xfs_bmbt_rec {
293 __be64 l0, l1;
294} xfs_bmbt_rec_t;
295
296typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
297typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
298
299typedef struct xfs_bmbt_rec_host {
300 __uint64_t l0, l1;
301} xfs_bmbt_rec_host_t;
302
303/*
304 * Values and macros for delayed-allocation startblock fields.
305 */
306#define STARTBLOCKVALBITS 17
307#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
308#define DSTARTBLOCKMASKBITS (15 + 20)
309#define STARTBLOCKMASK \
310 (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
311#define DSTARTBLOCKMASK \
312 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
313
314static inline int isnullstartblock(xfs_fsblock_t x)
315{
316 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
317}
318
319static inline int isnulldstartblock(xfs_dfsbno_t x)
320{
321 return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
322}
323
324static inline xfs_fsblock_t nullstartblock(int k)
325{
326 ASSERT(k < (1 << STARTBLOCKVALBITS));
327 return STARTBLOCKMASK | (k);
328}
329
330static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
331{
332 return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
333}
334
335/*
336 * Possible extent formats.
337 */
338typedef enum {
339 XFS_EXTFMT_NOSTATE = 0,
340 XFS_EXTFMT_HASSTATE
341} xfs_exntfmt_t;
342
343/*
344 * Possible extent states.
345 */
346typedef enum {
347 XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
348 XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
349} xfs_exntst_t;
350
351/*
352 * Incore version of above.
353 */
354typedef struct xfs_bmbt_irec
355{
356 xfs_fileoff_t br_startoff; /* starting file offset */
357 xfs_fsblock_t br_startblock; /* starting block number */
358 xfs_filblks_t br_blockcount; /* number of blocks */
359 xfs_exntst_t br_state; /* extent state */
360} xfs_bmbt_irec_t;
361
362/*
363 * Key structure for non-leaf levels of the tree.
364 */
365typedef struct xfs_bmbt_key {
366 __be64 br_startoff; /* starting file offset */
367} xfs_bmbt_key_t, xfs_bmdr_key_t;
368
369/* btree pointer type */
370typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
371
372
373/*
374 * Generic Btree block format definitions
375 *
376 * This is a combination of the actual format used on disk for short and long
377 * format btrees. The first three fields are shared by both format, but the
378 * pointers are different and should be used with care.
379 *
380 * To get the size of the actual short or long form headers please use the size
381 * macros below. Never use sizeof(xfs_btree_block).
382 *
383 * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
384 * with the crc feature bit, and all accesses to them must be conditional on
385 * that flag.
386 */
387struct xfs_btree_block {
388 __be32 bb_magic; /* magic number for block type */
389 __be16 bb_level; /* 0 is a leaf */
390 __be16 bb_numrecs; /* current # of data records */
391 union {
392 struct {
393 __be32 bb_leftsib;
394 __be32 bb_rightsib;
395
396 __be64 bb_blkno;
397 __be64 bb_lsn;
398 uuid_t bb_uuid;
399 __be32 bb_owner;
400 __le32 bb_crc;
401 } s; /* short form pointers */
402 struct {
403 __be64 bb_leftsib;
404 __be64 bb_rightsib;
405
406 __be64 bb_blkno;
407 __be64 bb_lsn;
408 uuid_t bb_uuid;
409 __be64 bb_owner;
410 __le32 bb_crc;
411 __be32 bb_pad; /* padding for alignment */
412 } l; /* long form pointers */
413 } bb_u; /* rest */
414};
415
416#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
417#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
418
419/* sizes of CRC enabled btree blocks */
420#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
421#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
422
423#define XFS_BTREE_SBLOCK_CRC_OFF \
424 offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
425#define XFS_BTREE_LBLOCK_CRC_OFF \
426 offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
427
428#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
new file mode 100644
index 000000000000..b62771f1f4b5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -0,0 +1,2189 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_inode.h"
30#include "xfs_btree.h"
31#include "xfs_ialloc.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_alloc.h"
34#include "xfs_rtalloc.h"
35#include "xfs_error.h"
36#include "xfs_bmap.h"
37#include "xfs_cksum.h"
38#include "xfs_trans.h"
39#include "xfs_buf_item.h"
40#include "xfs_icreate_item.h"
41#include "xfs_icache.h"
42#include "xfs_dinode.h"
43#include "xfs_trace.h"
44
45
46/*
47 * Allocation group level functions.
48 */
49static inline int
50xfs_ialloc_cluster_alignment(
51 xfs_alloc_arg_t *args)
52{
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
56 return args->mp->m_sb.sb_inoalignmt;
57 return 1;
58}
59
60/*
61 * Lookup a record by ino in the btree given by cur.
62 */
63int /* error */
64xfs_inobt_lookup(
65 struct xfs_btree_cur *cur, /* btree cursor */
66 xfs_agino_t ino, /* starting inode of chunk */
67 xfs_lookup_t dir, /* <=, >=, == */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = 0;
72 cur->bc_rec.i.ir_free = 0;
73 return xfs_btree_lookup(cur, dir, stat);
74}
75
76/*
77 * Update the record referred to by cur to the value given.
78 * This either works (return 0) or gets an EFSCORRUPTED error.
79 */
80STATIC int /* error */
81xfs_inobt_update(
82 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_inobt_rec_incore_t *irec) /* btree record */
84{
85 union xfs_btree_rec rec;
86
87 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
88 rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
89 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
90 return xfs_btree_update(cur, &rec);
91}
92
93/*
94 * Get the data from the pointed-to record.
95 */
96int /* error */
97xfs_inobt_get_rec(
98 struct xfs_btree_cur *cur, /* btree cursor */
99 xfs_inobt_rec_incore_t *irec, /* btree record */
100 int *stat) /* output: success/failure */
101{
102 union xfs_btree_rec *rec;
103 int error;
104
105 error = xfs_btree_get_rec(cur, &rec, stat);
106 if (!error && *stat == 1) {
107 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
108 irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
109 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
110 }
111 return error;
112}
113
114/*
115 * Insert a single inobt record. Cursor must already point to desired location.
116 */
117STATIC int
118xfs_inobt_insert_rec(
119 struct xfs_btree_cur *cur,
120 __int32_t freecount,
121 xfs_inofree_t free,
122 int *stat)
123{
124 cur->bc_rec.i.ir_freecount = freecount;
125 cur->bc_rec.i.ir_free = free;
126 return xfs_btree_insert(cur, stat);
127}
128
129/*
130 * Insert records describing a newly allocated inode chunk into the inobt.
131 */
132STATIC int
133xfs_inobt_insert(
134 struct xfs_mount *mp,
135 struct xfs_trans *tp,
136 struct xfs_buf *agbp,
137 xfs_agino_t newino,
138 xfs_agino_t newlen,
139 xfs_btnum_t btnum)
140{
141 struct xfs_btree_cur *cur;
142 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
143 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
144 xfs_agino_t thisino;
145 int i;
146 int error;
147
148 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
149
150 for (thisino = newino;
151 thisino < newino + newlen;
152 thisino += XFS_INODES_PER_CHUNK) {
153 error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
154 if (error) {
155 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
156 return error;
157 }
158 ASSERT(i == 0);
159
160 error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
161 XFS_INOBT_ALL_FREE, &i);
162 if (error) {
163 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
164 return error;
165 }
166 ASSERT(i == 1);
167 }
168
169 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
170
171 return 0;
172}
173
/*
 * Debug-only check that the AGI's free inode count matches the sum of the
 * per-record free counts in the btree.  The walk is only done when the
 * btree has a single level; non-debug builds compile this away to 0.
 */
#ifdef DEBUG
STATIC int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur,
	struct xfs_agi		*agi)
{
	xfs_inobt_rec_incore_t	rec;
	int			freecount = 0;
	int			error;
	int			i;

	if (cur->bc_nlevels != 1)
		return 0;

	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		return error;

	/* Walk the records from the first one, summing their free counts. */
	for (;;) {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			return error;
		if (!i)
			break;

		freecount += rec.ir_freecount;

		error = xfs_btree_increment(cur, 0, &i);
		if (error)
			return error;
		if (i != 1)
			break;
	}

	if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
		ASSERT(freecount == be32_to_cpu(agi->agi_freecount));

	return 0;
}
#else
#define xfs_check_agi_freecount(cur, agi)	0
#endif
214
215/*
216 * Initialise a new set of inodes. When called without a transaction context
217 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
218 * than logging them (which in a transaction context puts them into the AIL
219 * for writeback rather than the xfsbufd queue).
220 */
221int
222xfs_ialloc_inode_init(
223 struct xfs_mount *mp,
224 struct xfs_trans *tp,
225 struct list_head *buffer_list,
226 xfs_agnumber_t agno,
227 xfs_agblock_t agbno,
228 xfs_agblock_t length,
229 unsigned int gen)
230{
231 struct xfs_buf *fbuf;
232 struct xfs_dinode *free;
233 int nbufs, blks_per_cluster, inodes_per_cluster;
234 int version;
235 int i, j;
236 xfs_daddr_t d;
237 xfs_ino_t ino = 0;
238
239 /*
240 * Loop over the new block(s), filling in the inodes. For small block
241 * sizes, manipulate the inodes in buffers which are multiples of the
242 * blocks size.
243 */
244 blks_per_cluster = xfs_icluster_size_fsb(mp);
245 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
246 nbufs = length / blks_per_cluster;
247
248 /*
249 * Figure out what version number to use in the inodes we create. If
250 * the superblock version has caught up to the one that supports the new
251 * inode format, then use the new inode version. Otherwise use the old
252 * version so that old kernels will continue to be able to use the file
253 * system.
254 *
255 * For v3 inodes, we also need to write the inode number into the inode,
256 * so calculate the first inode number of the chunk here as
257 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
258 * across multiple filesystem blocks (such as a cluster) and so cannot
259 * be used in the cluster buffer loop below.
260 *
261 * Further, because we are writing the inode directly into the buffer
262 * and calculating a CRC on the entire inode, we have ot log the entire
263 * inode so that the entire range the CRC covers is present in the log.
264 * That means for v3 inode we log the entire buffer rather than just the
265 * inode cores.
266 */
267 if (xfs_sb_version_hascrc(&mp->m_sb)) {
268 version = 3;
269 ino = XFS_AGINO_TO_INO(mp, agno,
270 XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
271
272 /*
273 * log the initialisation that is about to take place as an
274 * logical operation. This means the transaction does not
275 * need to log the physical changes to the inode buffers as log
276 * recovery will know what initialisation is actually needed.
277 * Hence we only need to log the buffers as "ordered" buffers so
278 * they track in the AIL as if they were physically logged.
279 */
280 if (tp)
281 xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
282 mp->m_sb.sb_inodesize, length, gen);
283 } else
284 version = 2;
285
286 for (j = 0; j < nbufs; j++) {
287 /*
288 * Get the block.
289 */
290 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
291 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
292 mp->m_bsize * blks_per_cluster,
293 XBF_UNMAPPED);
294 if (!fbuf)
295 return -ENOMEM;
296
297 /* Initialize the inode buffers and log them appropriately. */
298 fbuf->b_ops = &xfs_inode_buf_ops;
299 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
300 for (i = 0; i < inodes_per_cluster; i++) {
301 int ioffset = i << mp->m_sb.sb_inodelog;
302 uint isize = xfs_dinode_size(version);
303
304 free = xfs_make_iptr(mp, fbuf, i);
305 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
306 free->di_version = version;
307 free->di_gen = cpu_to_be32(gen);
308 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
309
310 if (version == 3) {
311 free->di_ino = cpu_to_be64(ino);
312 ino++;
313 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
314 xfs_dinode_calc_crc(mp, free);
315 } else if (tp) {
316 /* just log the inode core */
317 xfs_trans_log_buf(tp, fbuf, ioffset,
318 ioffset + isize - 1);
319 }
320 }
321
322 if (tp) {
323 /*
324 * Mark the buffer as an inode allocation buffer so it
325 * sticks in AIL at the point of this allocation
326 * transaction. This ensures the they are on disk before
327 * the tail of the log can be moved past this
328 * transaction (i.e. by preventing relogging from moving
329 * it forward in the log).
330 */
331 xfs_trans_inode_alloc_buf(tp, fbuf);
332 if (version == 3) {
333 /*
334 * Mark the buffer as ordered so that they are
335 * not physically logged in the transaction but
336 * still tracked in the AIL as part of the
337 * transaction and pin the log appropriately.
338 */
339 xfs_trans_ordered_buf(tp, fbuf);
340 xfs_trans_log_buf(tp, fbuf, 0,
341 BBTOB(fbuf->b_length) - 1);
342 }
343 } else {
344 fbuf->b_flags |= XBF_DONE;
345 xfs_buf_delwri_queue(fbuf, buffer_list);
346 xfs_buf_relse(fbuf);
347 }
348 }
349 return 0;
350}
351
352/*
353 * Allocate new inodes in the allocation group specified by agbp.
354 * Return 0 for success, else error code.
355 */
356STATIC int /* error code or 0 */
357xfs_ialloc_ag_alloc(
358 xfs_trans_t *tp, /* transaction pointer */
359 xfs_buf_t *agbp, /* alloc group buffer */
360 int *alloc)
361{
362 xfs_agi_t *agi; /* allocation group header */
363 xfs_alloc_arg_t args; /* allocation argument structure */
364 xfs_agnumber_t agno;
365 int error;
366 xfs_agino_t newino; /* new first inode's number */
367 xfs_agino_t newlen; /* new number of inodes */
368 int isaligned = 0; /* inode allocation at stripe unit */
369 /* boundary */
370 struct xfs_perag *pag;
371
372 memset(&args, 0, sizeof(args));
373 args.tp = tp;
374 args.mp = tp->t_mountp;
375
376 /*
377 * Locking will ensure that we don't have two callers in here
378 * at one time.
379 */
380 newlen = args.mp->m_ialloc_inos;
381 if (args.mp->m_maxicount &&
382 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
383 return -ENOSPC;
384 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
385 /*
386 * First try to allocate inodes contiguous with the last-allocated
387 * chunk of inodes. If the filesystem is striped, this will fill
388 * an entire stripe unit with inodes.
389 */
390 agi = XFS_BUF_TO_AGI(agbp);
391 newino = be32_to_cpu(agi->agi_newino);
392 agno = be32_to_cpu(agi->agi_seqno);
393 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
394 args.mp->m_ialloc_blks;
395 if (likely(newino != NULLAGINO &&
396 (args.agbno < be32_to_cpu(agi->agi_length)))) {
397 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
398 args.type = XFS_ALLOCTYPE_THIS_BNO;
399 args.prod = 1;
400
401 /*
402 * We need to take into account alignment here to ensure that
403 * we don't modify the free list if we fail to have an exact
404 * block. If we don't have an exact match, and every oher
405 * attempt allocation attempt fails, we'll end up cancelling
406 * a dirty transaction and shutting down.
407 *
408 * For an exact allocation, alignment must be 1,
409 * however we need to take cluster alignment into account when
410 * fixing up the freelist. Use the minalignslop field to
411 * indicate that extra blocks might be required for alignment,
412 * but not to use them in the actual exact allocation.
413 */
414 args.alignment = 1;
415 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
416
417 /* Allow space for the inode btree to split. */
418 args.minleft = args.mp->m_in_maxlevels - 1;
419 if ((error = xfs_alloc_vextent(&args)))
420 return error;
421
422 /*
423 * This request might have dirtied the transaction if the AG can
424 * satisfy the request, but the exact block was not available.
425 * If the allocation did fail, subsequent requests will relax
426 * the exact agbno requirement and increase the alignment
427 * instead. It is critical that the total size of the request
428 * (len + alignment + slop) does not increase from this point
429 * on, so reset minalignslop to ensure it is not included in
430 * subsequent requests.
431 */
432 args.minalignslop = 0;
433 } else
434 args.fsbno = NULLFSBLOCK;
435
436 if (unlikely(args.fsbno == NULLFSBLOCK)) {
437 /*
438 * Set the alignment for the allocation.
439 * If stripe alignment is turned on then align at stripe unit
440 * boundary.
441 * If the cluster size is smaller than a filesystem block
442 * then we're doing I/O for inodes in filesystem block size
443 * pieces, so don't need alignment anyway.
444 */
445 isaligned = 0;
446 if (args.mp->m_sinoalign) {
447 ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
448 args.alignment = args.mp->m_dalign;
449 isaligned = 1;
450 } else
451 args.alignment = xfs_ialloc_cluster_alignment(&args);
452 /*
453 * Need to figure out where to allocate the inode blocks.
454 * Ideally they should be spaced out through the a.g.
455 * For now, just allocate blocks up front.
456 */
457 args.agbno = be32_to_cpu(agi->agi_root);
458 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
459 /*
460 * Allocate a fixed-size extent of inodes.
461 */
462 args.type = XFS_ALLOCTYPE_NEAR_BNO;
463 args.prod = 1;
464 /*
465 * Allow space for the inode btree to split.
466 */
467 args.minleft = args.mp->m_in_maxlevels - 1;
468 if ((error = xfs_alloc_vextent(&args)))
469 return error;
470 }
471
472 /*
473 * If stripe alignment is turned on, then try again with cluster
474 * alignment.
475 */
476 if (isaligned && args.fsbno == NULLFSBLOCK) {
477 args.type = XFS_ALLOCTYPE_NEAR_BNO;
478 args.agbno = be32_to_cpu(agi->agi_root);
479 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
480 args.alignment = xfs_ialloc_cluster_alignment(&args);
481 if ((error = xfs_alloc_vextent(&args)))
482 return error;
483 }
484
485 if (args.fsbno == NULLFSBLOCK) {
486 *alloc = 0;
487 return 0;
488 }
489 ASSERT(args.len == args.minlen);
490
491 /*
492 * Stamp and write the inode buffers.
493 *
494 * Seed the new inode cluster with a random generation number. This
495 * prevents short-term reuse of generation numbers if a chunk is
496 * freed and then immediately reallocated. We use random numbers
497 * rather than a linear progression to prevent the next generation
498 * number from being easily guessable.
499 */
500 error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
501 args.len, prandom_u32());
502
503 if (error)
504 return error;
505 /*
506 * Convert the results.
507 */
508 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
509 be32_add_cpu(&agi->agi_count, newlen);
510 be32_add_cpu(&agi->agi_freecount, newlen);
511 pag = xfs_perag_get(args.mp, agno);
512 pag->pagi_freecount += newlen;
513 xfs_perag_put(pag);
514 agi->agi_newino = cpu_to_be32(newino);
515
516 /*
517 * Insert records describing the new inode chunk into the btrees.
518 */
519 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
520 XFS_BTNUM_INO);
521 if (error)
522 return error;
523
524 if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
525 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
526 XFS_BTNUM_FINO);
527 if (error)
528 return error;
529 }
530 /*
531 * Log allocation group header fields
532 */
533 xfs_ialloc_log_agi(tp, agbp,
534 XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
535 /*
536 * Modify/log superblock values for inode count and inode free count.
537 */
538 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
539 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
540 *alloc = 1;
541 return 0;
542}
543
544STATIC xfs_agnumber_t
545xfs_ialloc_next_ag(
546 xfs_mount_t *mp)
547{
548 xfs_agnumber_t agno;
549
550 spin_lock(&mp->m_agirotor_lock);
551 agno = mp->m_agirotor;
552 if (++mp->m_agirotor >= mp->m_maxagi)
553 mp->m_agirotor = 0;
554 spin_unlock(&mp->m_agirotor_lock);
555
556 return agno;
557}
558
559/*
560 * Select an allocation group to look for a free inode in, based on the parent
561 * inode and the mode. Return the allocation group buffer.
562 */
563STATIC xfs_agnumber_t
564xfs_ialloc_ag_select(
565 xfs_trans_t *tp, /* transaction pointer */
566 xfs_ino_t parent, /* parent directory inode number */
567 umode_t mode, /* bits set to indicate file type */
568 int okalloc) /* ok to allocate more space */
569{
570 xfs_agnumber_t agcount; /* number of ag's in the filesystem */
571 xfs_agnumber_t agno; /* current ag number */
572 int flags; /* alloc buffer locking flags */
573 xfs_extlen_t ineed; /* blocks needed for inode allocation */
574 xfs_extlen_t longest = 0; /* longest extent available */
575 xfs_mount_t *mp; /* mount point structure */
576 int needspace; /* file mode implies space allocated */
577 xfs_perag_t *pag; /* per allocation group data */
578 xfs_agnumber_t pagno; /* parent (starting) ag number */
579 int error;
580
581 /*
582 * Files of these types need at least one block if length > 0
583 * (and they won't fit in the inode, but that's hard to figure out).
584 */
585 needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
586 mp = tp->t_mountp;
587 agcount = mp->m_maxagi;
588 if (S_ISDIR(mode))
589 pagno = xfs_ialloc_next_ag(mp);
590 else {
591 pagno = XFS_INO_TO_AGNO(mp, parent);
592 if (pagno >= agcount)
593 pagno = 0;
594 }
595
596 ASSERT(pagno < agcount);
597
598 /*
599 * Loop through allocation groups, looking for one with a little
600 * free space in it. Note we don't look for free inodes, exactly.
601 * Instead, we include whether there is a need to allocate inodes
602 * to mean that blocks must be allocated for them,
603 * if none are currently free.
604 */
605 agno = pagno;
606 flags = XFS_ALLOC_FLAG_TRYLOCK;
607 for (;;) {
608 pag = xfs_perag_get(mp, agno);
609 if (!pag->pagi_inodeok) {
610 xfs_ialloc_next_ag(mp);
611 goto nextag;
612 }
613
614 if (!pag->pagi_init) {
615 error = xfs_ialloc_pagi_init(mp, tp, agno);
616 if (error)
617 goto nextag;
618 }
619
620 if (pag->pagi_freecount) {
621 xfs_perag_put(pag);
622 return agno;
623 }
624
625 if (!okalloc)
626 goto nextag;
627
628 if (!pag->pagf_init) {
629 error = xfs_alloc_pagf_init(mp, tp, agno, flags);
630 if (error)
631 goto nextag;
632 }
633
634 /*
635 * Is there enough free space for the file plus a block of
636 * inodes? (if we need to allocate some)?
637 */
638 ineed = mp->m_ialloc_blks;
639 longest = pag->pagf_longest;
640 if (!longest)
641 longest = pag->pagf_flcount > 0;
642
643 if (pag->pagf_freeblks >= needspace + ineed &&
644 longest >= ineed) {
645 xfs_perag_put(pag);
646 return agno;
647 }
648nextag:
649 xfs_perag_put(pag);
650 /*
651 * No point in iterating over the rest, if we're shutting
652 * down.
653 */
654 if (XFS_FORCED_SHUTDOWN(mp))
655 return NULLAGNUMBER;
656 agno++;
657 if (agno >= agcount)
658 agno = 0;
659 if (agno == pagno) {
660 if (flags == 0)
661 return NULLAGNUMBER;
662 flags = 0;
663 }
664 }
665}
666
667/*
668 * Try to retrieve the next record to the left/right from the current one.
669 */
670STATIC int
671xfs_ialloc_next_rec(
672 struct xfs_btree_cur *cur,
673 xfs_inobt_rec_incore_t *rec,
674 int *done,
675 int left)
676{
677 int error;
678 int i;
679
680 if (left)
681 error = xfs_btree_decrement(cur, 0, &i);
682 else
683 error = xfs_btree_increment(cur, 0, &i);
684
685 if (error)
686 return error;
687 *done = !i;
688 if (i) {
689 error = xfs_inobt_get_rec(cur, rec, &i);
690 if (error)
691 return error;
692 XFS_WANT_CORRUPTED_RETURN(i == 1);
693 }
694
695 return 0;
696}
697
698STATIC int
699xfs_ialloc_get_rec(
700 struct xfs_btree_cur *cur,
701 xfs_agino_t agino,
702 xfs_inobt_rec_incore_t *rec,
703 int *done)
704{
705 int error;
706 int i;
707
708 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
709 if (error)
710 return error;
711 *done = !i;
712 if (i) {
713 error = xfs_inobt_get_rec(cur, rec, &i);
714 if (error)
715 return error;
716 XFS_WANT_CORRUPTED_RETURN(i == 1);
717 }
718
719 return 0;
720}
721
722/*
723 * Allocate an inode using the inobt-only algorithm.
724 */
725STATIC int
726xfs_dialloc_ag_inobt(
727 struct xfs_trans *tp,
728 struct xfs_buf *agbp,
729 xfs_ino_t parent,
730 xfs_ino_t *inop)
731{
732 struct xfs_mount *mp = tp->t_mountp;
733 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
734 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
735 xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
736 xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
737 struct xfs_perag *pag;
738 struct xfs_btree_cur *cur, *tcur;
739 struct xfs_inobt_rec_incore rec, trec;
740 xfs_ino_t ino;
741 int error;
742 int offset;
743 int i, j;
744
745 pag = xfs_perag_get(mp, agno);
746
747 ASSERT(pag->pagi_init);
748 ASSERT(pag->pagi_inodeok);
749 ASSERT(pag->pagi_freecount > 0);
750
751 restart_pagno:
752 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
753 /*
754 * If pagino is 0 (this is the root inode allocation) use newino.
755 * This must work because we've just allocated some.
756 */
757 if (!pagino)
758 pagino = be32_to_cpu(agi->agi_newino);
759
760 error = xfs_check_agi_freecount(cur, agi);
761 if (error)
762 goto error0;
763
764 /*
765 * If in the same AG as the parent, try to get near the parent.
766 */
767 if (pagno == agno) {
768 int doneleft; /* done, to the left */
769 int doneright; /* done, to the right */
770 int searchdistance = 10;
771
772 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
773 if (error)
774 goto error0;
775 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
776
777 error = xfs_inobt_get_rec(cur, &rec, &j);
778 if (error)
779 goto error0;
780 XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
781
782 if (rec.ir_freecount > 0) {
783 /*
784 * Found a free inode in the same chunk
785 * as the parent, done.
786 */
787 goto alloc_inode;
788 }
789
790
791 /*
792 * In the same AG as parent, but parent's chunk is full.
793 */
794
795 /* duplicate the cursor, search left & right simultaneously */
796 error = xfs_btree_dup_cursor(cur, &tcur);
797 if (error)
798 goto error0;
799
800 /*
801 * Skip to last blocks looked up if same parent inode.
802 */
803 if (pagino != NULLAGINO &&
804 pag->pagl_pagino == pagino &&
805 pag->pagl_leftrec != NULLAGINO &&
806 pag->pagl_rightrec != NULLAGINO) {
807 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
808 &trec, &doneleft);
809 if (error)
810 goto error1;
811
812 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
813 &rec, &doneright);
814 if (error)
815 goto error1;
816 } else {
817 /* search left with tcur, back up 1 record */
818 error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
819 if (error)
820 goto error1;
821
822 /* search right with cur, go forward 1 record. */
823 error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
824 if (error)
825 goto error1;
826 }
827
828 /*
829 * Loop until we find an inode chunk with a free inode.
830 */
831 while (!doneleft || !doneright) {
832 int useleft; /* using left inode chunk this time */
833
834 if (!--searchdistance) {
835 /*
836 * Not in range - save last search
837 * location and allocate a new inode
838 */
839 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
840 pag->pagl_leftrec = trec.ir_startino;
841 pag->pagl_rightrec = rec.ir_startino;
842 pag->pagl_pagino = pagino;
843 goto newino;
844 }
845
846 /* figure out the closer block if both are valid. */
847 if (!doneleft && !doneright) {
848 useleft = pagino -
849 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
850 rec.ir_startino - pagino;
851 } else {
852 useleft = !doneleft;
853 }
854
855 /* free inodes to the left? */
856 if (useleft && trec.ir_freecount) {
857 rec = trec;
858 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
859 cur = tcur;
860
861 pag->pagl_leftrec = trec.ir_startino;
862 pag->pagl_rightrec = rec.ir_startino;
863 pag->pagl_pagino = pagino;
864 goto alloc_inode;
865 }
866
867 /* free inodes to the right? */
868 if (!useleft && rec.ir_freecount) {
869 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
870
871 pag->pagl_leftrec = trec.ir_startino;
872 pag->pagl_rightrec = rec.ir_startino;
873 pag->pagl_pagino = pagino;
874 goto alloc_inode;
875 }
876
877 /* get next record to check */
878 if (useleft) {
879 error = xfs_ialloc_next_rec(tcur, &trec,
880 &doneleft, 1);
881 } else {
882 error = xfs_ialloc_next_rec(cur, &rec,
883 &doneright, 0);
884 }
885 if (error)
886 goto error1;
887 }
888
889 /*
890 * We've reached the end of the btree. because
891 * we are only searching a small chunk of the
892 * btree each search, there is obviously free
893 * inodes closer to the parent inode than we
894 * are now. restart the search again.
895 */
896 pag->pagl_pagino = NULLAGINO;
897 pag->pagl_leftrec = NULLAGINO;
898 pag->pagl_rightrec = NULLAGINO;
899 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
900 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
901 goto restart_pagno;
902 }
903
904 /*
905 * In a different AG from the parent.
906 * See if the most recently allocated block has any free.
907 */
908newino:
909 if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
910 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
911 XFS_LOOKUP_EQ, &i);
912 if (error)
913 goto error0;
914
915 if (i == 1) {
916 error = xfs_inobt_get_rec(cur, &rec, &j);
917 if (error)
918 goto error0;
919
920 if (j == 1 && rec.ir_freecount > 0) {
921 /*
922 * The last chunk allocated in the group
923 * still has a free inode.
924 */
925 goto alloc_inode;
926 }
927 }
928 }
929
930 /*
931 * None left in the last group, search the whole AG
932 */
933 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
934 if (error)
935 goto error0;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
937
938 for (;;) {
939 error = xfs_inobt_get_rec(cur, &rec, &i);
940 if (error)
941 goto error0;
942 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
943 if (rec.ir_freecount > 0)
944 break;
945 error = xfs_btree_increment(cur, 0, &i);
946 if (error)
947 goto error0;
948 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
949 }
950
951alloc_inode:
952 offset = xfs_lowbit64(rec.ir_free);
953 ASSERT(offset >= 0);
954 ASSERT(offset < XFS_INODES_PER_CHUNK);
955 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
956 XFS_INODES_PER_CHUNK) == 0);
957 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
958 rec.ir_free &= ~XFS_INOBT_MASK(offset);
959 rec.ir_freecount--;
960 error = xfs_inobt_update(cur, &rec);
961 if (error)
962 goto error0;
963 be32_add_cpu(&agi->agi_freecount, -1);
964 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
965 pag->pagi_freecount--;
966
967 error = xfs_check_agi_freecount(cur, agi);
968 if (error)
969 goto error0;
970
971 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
972 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
973 xfs_perag_put(pag);
974 *inop = ino;
975 return 0;
976error1:
977 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
978error0:
979 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
980 xfs_perag_put(pag);
981 return error;
982}
983
984/*
985 * Use the free inode btree to allocate an inode based on distance from the
986 * parent. Note that the provided cursor may be deleted and replaced.
987 */
988STATIC int
989xfs_dialloc_ag_finobt_near(
990 xfs_agino_t pagino,
991 struct xfs_btree_cur **ocur,
992 struct xfs_inobt_rec_incore *rec)
993{
994 struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
995 struct xfs_btree_cur *rcur; /* right search cursor */
996 struct xfs_inobt_rec_incore rrec;
997 int error;
998 int i, j;
999
1000 error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
1001 if (error)
1002 return error;
1003
1004 if (i == 1) {
1005 error = xfs_inobt_get_rec(lcur, rec, &i);
1006 if (error)
1007 return error;
1008 XFS_WANT_CORRUPTED_RETURN(i == 1);
1009
1010 /*
1011 * See if we've landed in the parent inode record. The finobt
1012 * only tracks chunks with at least one free inode, so record
1013 * existence is enough.
1014 */
1015 if (pagino >= rec->ir_startino &&
1016 pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
1017 return 0;
1018 }
1019
1020 error = xfs_btree_dup_cursor(lcur, &rcur);
1021 if (error)
1022 return error;
1023
1024 error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
1025 if (error)
1026 goto error_rcur;
1027 if (j == 1) {
1028 error = xfs_inobt_get_rec(rcur, &rrec, &j);
1029 if (error)
1030 goto error_rcur;
1031 XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
1032 }
1033
1034 XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
1035 if (i == 1 && j == 1) {
1036 /*
1037 * Both the left and right records are valid. Choose the closer
1038 * inode chunk to the target.
1039 */
1040 if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
1041 (rrec.ir_startino - pagino)) {
1042 *rec = rrec;
1043 xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1044 *ocur = rcur;
1045 } else {
1046 xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1047 }
1048 } else if (j == 1) {
1049 /* only the right record is valid */
1050 *rec = rrec;
1051 xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1052 *ocur = rcur;
1053 } else if (i == 1) {
1054 /* only the left record is valid */
1055 xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1056 }
1057
1058 return 0;
1059
1060error_rcur:
1061 xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
1062 return error;
1063}
1064
1065/*
1066 * Use the free inode btree to find a free inode based on a newino hint. If
1067 * the hint is NULL, find the first free inode in the AG.
1068 */
1069STATIC int
1070xfs_dialloc_ag_finobt_newino(
1071 struct xfs_agi *agi,
1072 struct xfs_btree_cur *cur,
1073 struct xfs_inobt_rec_incore *rec)
1074{
1075 int error;
1076 int i;
1077
1078 if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1079 error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
1080 &i);
1081 if (error)
1082 return error;
1083 if (i == 1) {
1084 error = xfs_inobt_get_rec(cur, rec, &i);
1085 if (error)
1086 return error;
1087 XFS_WANT_CORRUPTED_RETURN(i == 1);
1088
1089 return 0;
1090 }
1091 }
1092
1093 /*
1094 * Find the first inode available in the AG.
1095 */
1096 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1097 if (error)
1098 return error;
1099 XFS_WANT_CORRUPTED_RETURN(i == 1);
1100
1101 error = xfs_inobt_get_rec(cur, rec, &i);
1102 if (error)
1103 return error;
1104 XFS_WANT_CORRUPTED_RETURN(i == 1);
1105
1106 return 0;
1107}
1108
1109/*
1110 * Update the inobt based on a modification made to the finobt. Also ensure that
1111 * the records from both trees are equivalent post-modification.
1112 */
1113STATIC int
1114xfs_dialloc_ag_update_inobt(
1115 struct xfs_btree_cur *cur, /* inobt cursor */
1116 struct xfs_inobt_rec_incore *frec, /* finobt record */
1117 int offset) /* inode offset */
1118{
1119 struct xfs_inobt_rec_incore rec;
1120 int error;
1121 int i;
1122
1123 error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1124 if (error)
1125 return error;
1126 XFS_WANT_CORRUPTED_RETURN(i == 1);
1127
1128 error = xfs_inobt_get_rec(cur, &rec, &i);
1129 if (error)
1130 return error;
1131 XFS_WANT_CORRUPTED_RETURN(i == 1);
1132 ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1133 XFS_INODES_PER_CHUNK) == 0);
1134
1135 rec.ir_free &= ~XFS_INOBT_MASK(offset);
1136 rec.ir_freecount--;
1137
1138 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
1139 (rec.ir_freecount == frec->ir_freecount));
1140
1141 error = xfs_inobt_update(cur, &rec);
1142 if (error)
1143 return error;
1144
1145 return 0;
1146}
1147
1148/*
1149 * Allocate an inode using the free inode btree, if available. Otherwise, fall
1150 * back to the inobt search algorithm.
1151 *
1152 * The caller selected an AG for us, and made sure that free inodes are
1153 * available.
1154 */
1155STATIC int
1156xfs_dialloc_ag(
1157 struct xfs_trans *tp,
1158 struct xfs_buf *agbp,
1159 xfs_ino_t parent,
1160 xfs_ino_t *inop)
1161{
1162 struct xfs_mount *mp = tp->t_mountp;
1163 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
1164 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
1165 xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
1166 xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
1167 struct xfs_perag *pag;
1168 struct xfs_btree_cur *cur; /* finobt cursor */
1169 struct xfs_btree_cur *icur; /* inobt cursor */
1170 struct xfs_inobt_rec_incore rec;
1171 xfs_ino_t ino;
1172 int error;
1173 int offset;
1174 int i;
1175
1176 if (!xfs_sb_version_hasfinobt(&mp->m_sb))
1177 return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
1178
1179 pag = xfs_perag_get(mp, agno);
1180
1181 /*
1182 * If pagino is 0 (this is the root inode allocation) use newino.
1183 * This must work because we've just allocated some.
1184 */
1185 if (!pagino)
1186 pagino = be32_to_cpu(agi->agi_newino);
1187
1188 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
1189
1190 error = xfs_check_agi_freecount(cur, agi);
1191 if (error)
1192 goto error_cur;
1193
1194 /*
1195 * The search algorithm depends on whether we're in the same AG as the
1196 * parent. If so, find the closest available inode to the parent. If
1197 * not, consider the agi hint or find the first free inode in the AG.
1198 */
1199 if (agno == pagno)
1200 error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
1201 else
1202 error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
1203 if (error)
1204 goto error_cur;
1205
1206 offset = xfs_lowbit64(rec.ir_free);
1207 ASSERT(offset >= 0);
1208 ASSERT(offset < XFS_INODES_PER_CHUNK);
1209 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1210 XFS_INODES_PER_CHUNK) == 0);
1211 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
1212
1213 /*
1214 * Modify or remove the finobt record.
1215 */
1216 rec.ir_free &= ~XFS_INOBT_MASK(offset);
1217 rec.ir_freecount--;
1218 if (rec.ir_freecount)
1219 error = xfs_inobt_update(cur, &rec);
1220 else
1221 error = xfs_btree_delete(cur, &i);
1222 if (error)
1223 goto error_cur;
1224
1225 /*
1226 * The finobt has now been updated appropriately. We haven't updated the
1227 * agi and superblock yet, so we can create an inobt cursor and validate
1228 * the original freecount. If all is well, make the equivalent update to
1229 * the inobt using the finobt record and offset information.
1230 */
1231 icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1232
1233 error = xfs_check_agi_freecount(icur, agi);
1234 if (error)
1235 goto error_icur;
1236
1237 error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
1238 if (error)
1239 goto error_icur;
1240
1241 /*
1242 * Both trees have now been updated. We must update the perag and
1243 * superblock before we can check the freecount for each btree.
1244 */
1245 be32_add_cpu(&agi->agi_freecount, -1);
1246 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1247 pag->pagi_freecount--;
1248
1249 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1250
1251 error = xfs_check_agi_freecount(icur, agi);
1252 if (error)
1253 goto error_icur;
1254 error = xfs_check_agi_freecount(cur, agi);
1255 if (error)
1256 goto error_icur;
1257
1258 xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
1259 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1260 xfs_perag_put(pag);
1261 *inop = ino;
1262 return 0;
1263
1264error_icur:
1265 xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
1266error_cur:
1267 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1268 xfs_perag_put(pag);
1269 return error;
1270}
1271
1272/*
1273 * Allocate an inode on disk.
1274 *
1275 * Mode is used to tell whether the new inode will need space, and whether it
1276 * is a directory.
1277 *
1278 * This function is designed to be called twice if it has to do an allocation
1279 * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
1280 * If an inode is available without having to performn an allocation, an inode
1281 * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
1282 * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
1283 * The caller should then commit the current transaction, allocate a
1284 * new transaction, and call xfs_dialloc() again, passing in the previous value
1285 * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
1286 * buffer is locked across the two calls, the second call is guaranteed to have
1287 * a free inode available.
1288 *
1289 * Once we successfully pick an inode its number is returned and the on-disk
1290 * data structures are updated. The inode itself is not read in, since doing so
1291 * would break ordering constraints with xfs_reclaim.
1292 */
1293int
1294xfs_dialloc(
1295 struct xfs_trans *tp,
1296 xfs_ino_t parent,
1297 umode_t mode,
1298 int okalloc,
1299 struct xfs_buf **IO_agbp,
1300 xfs_ino_t *inop)
1301{
1302 struct xfs_mount *mp = tp->t_mountp;
1303 struct xfs_buf *agbp;
1304 xfs_agnumber_t agno;
1305 int error;
1306 int ialloced;
1307 int noroom = 0;
1308 xfs_agnumber_t start_agno;
1309 struct xfs_perag *pag;
1310
1311 if (*IO_agbp) {
1312 /*
1313 * If the caller passes in a pointer to the AGI buffer,
1314 * continue where we left off before. In this case, we
1315 * know that the allocation group has free inodes.
1316 */
1317 agbp = *IO_agbp;
1318 goto out_alloc;
1319 }
1320
1321 /*
1322 * We do not have an agbp, so select an initial allocation
1323 * group for inode allocation.
1324 */
1325 start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
1326 if (start_agno == NULLAGNUMBER) {
1327 *inop = NULLFSINO;
1328 return 0;
1329 }
1330
1331 /*
1332 * If we have already hit the ceiling of inode blocks then clear
1333 * okalloc so we scan all available agi structures for a free
1334 * inode.
1335 */
1336 if (mp->m_maxicount &&
1337 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
1338 noroom = 1;
1339 okalloc = 0;
1340 }
1341
1342 /*
1343 * Loop until we find an allocation group that either has free inodes
1344 * or in which we can allocate some inodes. Iterate through the
1345 * allocation groups upward, wrapping at the end.
1346 */
1347 agno = start_agno;
1348 for (;;) {
1349 pag = xfs_perag_get(mp, agno);
1350 if (!pag->pagi_inodeok) {
1351 xfs_ialloc_next_ag(mp);
1352 goto nextag;
1353 }
1354
1355 if (!pag->pagi_init) {
1356 error = xfs_ialloc_pagi_init(mp, tp, agno);
1357 if (error)
1358 goto out_error;
1359 }
1360
1361 /*
1362 * Do a first racy fast path check if this AG is usable.
1363 */
1364 if (!pag->pagi_freecount && !okalloc)
1365 goto nextag;
1366
1367 /*
1368 * Then read in the AGI buffer and recheck with the AGI buffer
1369 * lock held.
1370 */
1371 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1372 if (error)
1373 goto out_error;
1374
1375 if (pag->pagi_freecount) {
1376 xfs_perag_put(pag);
1377 goto out_alloc;
1378 }
1379
1380 if (!okalloc)
1381 goto nextag_relse_buffer;
1382
1383
1384 error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
1385 if (error) {
1386 xfs_trans_brelse(tp, agbp);
1387
1388 if (error != -ENOSPC)
1389 goto out_error;
1390
1391 xfs_perag_put(pag);
1392 *inop = NULLFSINO;
1393 return 0;
1394 }
1395
1396 if (ialloced) {
1397 /*
1398 * We successfully allocated some inodes, return
1399 * the current context to the caller so that it
1400 * can commit the current transaction and call
1401 * us again where we left off.
1402 */
1403 ASSERT(pag->pagi_freecount > 0);
1404 xfs_perag_put(pag);
1405
1406 *IO_agbp = agbp;
1407 *inop = NULLFSINO;
1408 return 0;
1409 }
1410
1411nextag_relse_buffer:
1412 xfs_trans_brelse(tp, agbp);
1413nextag:
1414 xfs_perag_put(pag);
1415 if (++agno == mp->m_sb.sb_agcount)
1416 agno = 0;
1417 if (agno == start_agno) {
1418 *inop = NULLFSINO;
1419 return noroom ? -ENOSPC : 0;
1420 }
1421 }
1422
1423out_alloc:
1424 *IO_agbp = NULL;
1425 return xfs_dialloc_ag(tp, agbp, parent, inop);
1426out_error:
1427 xfs_perag_put(pag);
1428 return error;
1429}
1430
1431STATIC int
1432xfs_difree_inobt(
1433 struct xfs_mount *mp,
1434 struct xfs_trans *tp,
1435 struct xfs_buf *agbp,
1436 xfs_agino_t agino,
1437 struct xfs_bmap_free *flist,
1438 int *deleted,
1439 xfs_ino_t *first_ino,
1440 struct xfs_inobt_rec_incore *orec)
1441{
1442 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
1443 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
1444 struct xfs_perag *pag;
1445 struct xfs_btree_cur *cur;
1446 struct xfs_inobt_rec_incore rec;
1447 int ilen;
1448 int error;
1449 int i;
1450 int off;
1451
1452 ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1453 ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
1454
1455 /*
1456 * Initialize the cursor.
1457 */
1458 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1459
1460 error = xfs_check_agi_freecount(cur, agi);
1461 if (error)
1462 goto error0;
1463
1464 /*
1465 * Look for the entry describing this inode.
1466 */
1467 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1468 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1469 __func__, error);
1470 goto error0;
1471 }
1472 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1473 error = xfs_inobt_get_rec(cur, &rec, &i);
1474 if (error) {
1475 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1476 __func__, error);
1477 goto error0;
1478 }
1479 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1480 /*
1481 * Get the offset in the inode chunk.
1482 */
1483 off = agino - rec.ir_startino;
1484 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1485 ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1486 /*
1487 * Mark the inode free & increment the count.
1488 */
1489 rec.ir_free |= XFS_INOBT_MASK(off);
1490 rec.ir_freecount++;
1491
1492 /*
1493 * When an inode cluster is free, it becomes eligible for removal
1494 */
1495 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1496 (rec.ir_freecount == mp->m_ialloc_inos)) {
1497
1498 *deleted = 1;
1499 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1500
1501 /*
1502 * Remove the inode cluster from the AGI B+Tree, adjust the
1503 * AGI and Superblock inode counts, and mark the disk space
1504 * to be freed when the transaction is committed.
1505 */
1506 ilen = mp->m_ialloc_inos;
1507 be32_add_cpu(&agi->agi_count, -ilen);
1508 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1509 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1510 pag = xfs_perag_get(mp, agno);
1511 pag->pagi_freecount -= ilen - 1;
1512 xfs_perag_put(pag);
1513 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1514 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1515
1516 if ((error = xfs_btree_delete(cur, &i))) {
1517 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1518 __func__, error);
1519 goto error0;
1520 }
1521
1522 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
1523 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
1524 mp->m_ialloc_blks, flist, mp);
1525 } else {
1526 *deleted = 0;
1527
1528 error = xfs_inobt_update(cur, &rec);
1529 if (error) {
1530 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1531 __func__, error);
1532 goto error0;
1533 }
1534
1535 /*
1536 * Change the inode free counts and log the ag/sb changes.
1537 */
1538 be32_add_cpu(&agi->agi_freecount, 1);
1539 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1540 pag = xfs_perag_get(mp, agno);
1541 pag->pagi_freecount++;
1542 xfs_perag_put(pag);
1543 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1544 }
1545
1546 error = xfs_check_agi_freecount(cur, agi);
1547 if (error)
1548 goto error0;
1549
1550 *orec = rec;
1551 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1552 return 0;
1553
1554error0:
1555 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1556 return error;
1557}
1558
1559/*
1560 * Free an inode in the free inode btree.
1561 */
1562STATIC int
1563xfs_difree_finobt(
1564 struct xfs_mount *mp,
1565 struct xfs_trans *tp,
1566 struct xfs_buf *agbp,
1567 xfs_agino_t agino,
1568 struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
1569{
1570 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
1571 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
1572 struct xfs_btree_cur *cur;
1573 struct xfs_inobt_rec_incore rec;
1574 int offset = agino - ibtrec->ir_startino;
1575 int error;
1576 int i;
1577
1578 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
1579
1580 error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
1581 if (error)
1582 goto error;
1583 if (i == 0) {
1584 /*
1585 * If the record does not exist in the finobt, we must have just
1586 * freed an inode in a previously fully allocated chunk. If not,
1587 * something is out of sync.
1588 */
1589 XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
1590
1591 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
1592 ibtrec->ir_free, &i);
1593 if (error)
1594 goto error;
1595 ASSERT(i == 1);
1596
1597 goto out;
1598 }
1599
1600 /*
1601 * Read and update the existing record. We could just copy the ibtrec
1602 * across here, but that would defeat the purpose of having redundant
1603 * metadata. By making the modifications independently, we can catch
1604 * corruptions that we wouldn't see if we just copied from one record
1605 * to another.
1606 */
1607 error = xfs_inobt_get_rec(cur, &rec, &i);
1608 if (error)
1609 goto error;
1610 XFS_WANT_CORRUPTED_GOTO(i == 1, error);
1611
1612 rec.ir_free |= XFS_INOBT_MASK(offset);
1613 rec.ir_freecount++;
1614
1615 XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
1616 (rec.ir_freecount == ibtrec->ir_freecount),
1617 error);
1618
1619 /*
1620 * The content of inobt records should always match between the inobt
1621 * and finobt. The lifecycle of records in the finobt is different from
1622 * the inobt in that the finobt only tracks records with at least one
1623 * free inode. Hence, if all of the inodes are free and we aren't
1624 * keeping inode chunks permanently on disk, remove the record.
1625 * Otherwise, update the record with the new information.
1626 */
1627 if (rec.ir_freecount == mp->m_ialloc_inos &&
1628 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1629 error = xfs_btree_delete(cur, &i);
1630 if (error)
1631 goto error;
1632 ASSERT(i == 1);
1633 } else {
1634 error = xfs_inobt_update(cur, &rec);
1635 if (error)
1636 goto error;
1637 }
1638
1639out:
1640 error = xfs_check_agi_freecount(cur, agi);
1641 if (error)
1642 goto error;
1643
1644 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1645 return 0;
1646
1647error:
1648 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1649 return error;
1650}
1651
1652/*
1653 * Free disk inode. Carefully avoids touching the incore inode, all
1654 * manipulations incore are the caller's responsibility.
1655 * The on-disk inode is not changed by this operation, only the
1656 * btree (free inode mask) is changed.
1657 */
1658int
1659xfs_difree(
1660 struct xfs_trans *tp, /* transaction pointer */
1661 xfs_ino_t inode, /* inode to be freed */
1662 struct xfs_bmap_free *flist, /* extents to free */
1663 int *deleted,/* set if inode cluster was deleted */
1664 xfs_ino_t *first_ino)/* first inode in deleted cluster */
1665{
1666 /* REFERENCED */
1667 xfs_agblock_t agbno; /* block number containing inode */
1668 struct xfs_buf *agbp; /* buffer for allocation group header */
1669 xfs_agino_t agino; /* allocation group inode number */
1670 xfs_agnumber_t agno; /* allocation group number */
1671 int error; /* error return value */
1672 struct xfs_mount *mp; /* mount structure for filesystem */
1673 struct xfs_inobt_rec_incore rec;/* btree record */
1674
1675 mp = tp->t_mountp;
1676
1677 /*
1678 * Break up inode number into its components.
1679 */
1680 agno = XFS_INO_TO_AGNO(mp, inode);
1681 if (agno >= mp->m_sb.sb_agcount) {
1682 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1683 __func__, agno, mp->m_sb.sb_agcount);
1684 ASSERT(0);
1685 return -EINVAL;
1686 }
1687 agino = XFS_INO_TO_AGINO(mp, inode);
1688 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1689 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1690 __func__, (unsigned long long)inode,
1691 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1692 ASSERT(0);
1693 return -EINVAL;
1694 }
1695 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1696 if (agbno >= mp->m_sb.sb_agblocks) {
1697 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1698 __func__, agbno, mp->m_sb.sb_agblocks);
1699 ASSERT(0);
1700 return -EINVAL;
1701 }
1702 /*
1703 * Get the allocation group header.
1704 */
1705 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1706 if (error) {
1707 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1708 __func__, error);
1709 return error;
1710 }
1711
1712 /*
1713 * Fix up the inode allocation btree.
1714 */
1715 error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
1716 &rec);
1717 if (error)
1718 goto error0;
1719
1720 /*
1721 * Fix up the free inode btree.
1722 */
1723 if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
1724 error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
1725 if (error)
1726 goto error0;
1727 }
1728
1729 return 0;
1730
1731error0:
1732 return error;
1733}
1734
1735STATIC int
1736xfs_imap_lookup(
1737 struct xfs_mount *mp,
1738 struct xfs_trans *tp,
1739 xfs_agnumber_t agno,
1740 xfs_agino_t agino,
1741 xfs_agblock_t agbno,
1742 xfs_agblock_t *chunk_agbno,
1743 xfs_agblock_t *offset_agbno,
1744 int flags)
1745{
1746 struct xfs_inobt_rec_incore rec;
1747 struct xfs_btree_cur *cur;
1748 struct xfs_buf *agbp;
1749 int error;
1750 int i;
1751
1752 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1753 if (error) {
1754 xfs_alert(mp,
1755 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1756 __func__, error, agno);
1757 return error;
1758 }
1759
1760 /*
1761 * Lookup the inode record for the given agino. If the record cannot be
1762 * found, then it's an invalid inode number and we should abort. Once
1763 * we have a record, we need to ensure it contains the inode number
1764 * we are looking up.
1765 */
1766 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1767 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1768 if (!error) {
1769 if (i)
1770 error = xfs_inobt_get_rec(cur, &rec, &i);
1771 if (!error && i == 0)
1772 error = -EINVAL;
1773 }
1774
1775 xfs_trans_brelse(tp, agbp);
1776 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1777 if (error)
1778 return error;
1779
1780 /* check that the returned record contains the required inode */
1781 if (rec.ir_startino > agino ||
1782 rec.ir_startino + mp->m_ialloc_inos <= agino)
1783 return -EINVAL;
1784
1785 /* for untrusted inodes check it is allocated first */
1786 if ((flags & XFS_IGET_UNTRUSTED) &&
1787 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
1788 return -EINVAL;
1789
1790 *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
1791 *offset_agbno = agbno - *chunk_agbno;
1792 return 0;
1793}
1794
1795/*
1796 * Return the location of the inode in imap, for mapping it into a buffer.
1797 */
1798int
1799xfs_imap(
1800 xfs_mount_t *mp, /* file system mount structure */
1801 xfs_trans_t *tp, /* transaction pointer */
1802 xfs_ino_t ino, /* inode to locate */
1803 struct xfs_imap *imap, /* location map structure */
1804 uint flags) /* flags for inode btree lookup */
1805{
1806 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1807 xfs_agino_t agino; /* inode number within alloc group */
1808 xfs_agnumber_t agno; /* allocation group number */
1809 int blks_per_cluster; /* num blocks per inode cluster */
1810 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1811 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1812 int error; /* error code */
1813 int offset; /* index of inode in its buffer */
1814 xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
1815
1816 ASSERT(ino != NULLFSINO);
1817
1818 /*
1819 * Split up the inode number into its parts.
1820 */
1821 agno = XFS_INO_TO_AGNO(mp, ino);
1822 agino = XFS_INO_TO_AGINO(mp, ino);
1823 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1824 if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
1825 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1826#ifdef DEBUG
1827 /*
1828 * Don't output diagnostic information for untrusted inodes
1829 * as they can be invalid without implying corruption.
1830 */
1831 if (flags & XFS_IGET_UNTRUSTED)
1832 return -EINVAL;
1833 if (agno >= mp->m_sb.sb_agcount) {
1834 xfs_alert(mp,
1835 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1836 __func__, agno, mp->m_sb.sb_agcount);
1837 }
1838 if (agbno >= mp->m_sb.sb_agblocks) {
1839 xfs_alert(mp,
1840 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1841 __func__, (unsigned long long)agbno,
1842 (unsigned long)mp->m_sb.sb_agblocks);
1843 }
1844 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1845 xfs_alert(mp,
1846 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1847 __func__, ino,
1848 XFS_AGINO_TO_INO(mp, agno, agino));
1849 }
1850 xfs_stack_trace();
1851#endif /* DEBUG */
1852 return -EINVAL;
1853 }
1854
1855 blks_per_cluster = xfs_icluster_size_fsb(mp);
1856
1857 /*
1858 * For bulkstat and handle lookups, we have an untrusted inode number
1859 * that we have to verify is valid. We cannot do this just by reading
1860 * the inode buffer as it may have been unlinked and removed leaving
1861 * inodes in stale state on disk. Hence we have to do a btree lookup
1862 * in all cases where an untrusted inode number is passed.
1863 */
1864 if (flags & XFS_IGET_UNTRUSTED) {
1865 error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
1866 &chunk_agbno, &offset_agbno, flags);
1867 if (error)
1868 return error;
1869 goto out_map;
1870 }
1871
1872 /*
1873 * If the inode cluster size is the same as the blocksize or
1874 * smaller we get to the buffer by simple arithmetics.
1875 */
1876 if (blks_per_cluster == 1) {
1877 offset = XFS_INO_TO_OFFSET(mp, ino);
1878 ASSERT(offset < mp->m_sb.sb_inopblock);
1879
1880 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
1881 imap->im_len = XFS_FSB_TO_BB(mp, 1);
1882 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1883 return 0;
1884 }
1885
1886 /*
1887 * If the inode chunks are aligned then use simple maths to
1888 * find the location. Otherwise we have to do a btree
1889 * lookup to find the location.
1890 */
1891 if (mp->m_inoalign_mask) {
1892 offset_agbno = agbno & mp->m_inoalign_mask;
1893 chunk_agbno = agbno - offset_agbno;
1894 } else {
1895 error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
1896 &chunk_agbno, &offset_agbno, flags);
1897 if (error)
1898 return error;
1899 }
1900
1901out_map:
1902 ASSERT(agbno >= chunk_agbno);
1903 cluster_agbno = chunk_agbno +
1904 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1905 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1906 XFS_INO_TO_OFFSET(mp, ino);
1907
1908 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
1909 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1910 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1911
1912 /*
1913 * If the inode number maps to a block outside the bounds
1914 * of the file system then return NULL rather than calling
1915 * read_buf and panicing when we get an error from the
1916 * driver.
1917 */
1918 if ((imap->im_blkno + imap->im_len) >
1919 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1920 xfs_alert(mp,
1921 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1922 __func__, (unsigned long long) imap->im_blkno,
1923 (unsigned long long) imap->im_len,
1924 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1925 return -EINVAL;
1926 }
1927 return 0;
1928}
1929
1930/*
1931 * Compute and fill in value of m_in_maxlevels.
1932 */
1933void
1934xfs_ialloc_compute_maxlevels(
1935 xfs_mount_t *mp) /* file system mount structure */
1936{
1937 int level;
1938 uint maxblocks;
1939 uint maxleafents;
1940 int minleafrecs;
1941 int minnoderecs;
1942
1943 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
1944 XFS_INODES_PER_CHUNK_LOG;
1945 minleafrecs = mp->m_alloc_mnr[0];
1946 minnoderecs = mp->m_alloc_mnr[1];
1947 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1948 for (level = 1; maxblocks > 1; level++)
1949 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1950 mp->m_in_maxlevels = level;
1951}
1952
1953/*
1954 * Log specified fields for the ag hdr (inode section). The growth of the agi
1955 * structure over time requires that we interpret the buffer as two logical
1956 * regions delineated by the end of the unlinked list. This is due to the size
1957 * of the hash table and its location in the middle of the agi.
1958 *
1959 * For example, a request to log a field before agi_unlinked and a field after
1960 * agi_unlinked could cause us to log the entire hash table and use an excessive
1961 * amount of log space. To avoid this behavior, log the region up through
1962 * agi_unlinked in one call and the region after agi_unlinked through the end of
1963 * the structure in another.
1964 */
1965void
1966xfs_ialloc_log_agi(
1967 xfs_trans_t *tp, /* transaction pointer */
1968 xfs_buf_t *bp, /* allocation group header buffer */
1969 int fields) /* bitmask of fields to log */
1970{
1971 int first; /* first byte number */
1972 int last; /* last byte number */
1973 static const short offsets[] = { /* field starting offsets */
1974 /* keep in sync with bit definitions */
1975 offsetof(xfs_agi_t, agi_magicnum),
1976 offsetof(xfs_agi_t, agi_versionnum),
1977 offsetof(xfs_agi_t, agi_seqno),
1978 offsetof(xfs_agi_t, agi_length),
1979 offsetof(xfs_agi_t, agi_count),
1980 offsetof(xfs_agi_t, agi_root),
1981 offsetof(xfs_agi_t, agi_level),
1982 offsetof(xfs_agi_t, agi_freecount),
1983 offsetof(xfs_agi_t, agi_newino),
1984 offsetof(xfs_agi_t, agi_dirino),
1985 offsetof(xfs_agi_t, agi_unlinked),
1986 offsetof(xfs_agi_t, agi_free_root),
1987 offsetof(xfs_agi_t, agi_free_level),
1988 sizeof(xfs_agi_t)
1989 };
1990#ifdef DEBUG
1991 xfs_agi_t *agi; /* allocation group header */
1992
1993 agi = XFS_BUF_TO_AGI(bp);
1994 ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1995#endif
1996
1997 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
1998
1999 /*
2000 * Compute byte offsets for the first and last fields in the first
2001 * region and log the agi buffer. This only logs up through
2002 * agi_unlinked.
2003 */
2004 if (fields & XFS_AGI_ALL_BITS_R1) {
2005 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
2006 &first, &last);
2007 xfs_trans_log_buf(tp, bp, first, last);
2008 }
2009
2010 /*
2011 * Mask off the bits in the first region and calculate the first and
2012 * last field offsets for any bits in the second region.
2013 */
2014 fields &= ~XFS_AGI_ALL_BITS_R1;
2015 if (fields) {
2016 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
2017 &first, &last);
2018 xfs_trans_log_buf(tp, bp, first, last);
2019 }
2020}
2021
#ifdef DEBUG
/*
 * Debug-only check: assert on every bucket of the AGI unlinked hash table
 * that holds zero.  Compiles away to nothing in non-DEBUG builds.
 */
STATIC void
xfs_check_agi_unlinked(
	struct xfs_agi		*agi)
{
	int			i = 0;

	while (i < XFS_AGI_UNLINKED_BUCKETS) {
		ASSERT(agi->agi_unlinked[i]);
		i++;
	}
}
#else
#define xfs_check_agi_unlinked(agi)
#endif
2035
2036static bool
2037xfs_agi_verify(
2038 struct xfs_buf *bp)
2039{
2040 struct xfs_mount *mp = bp->b_target->bt_mount;
2041 struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
2042
2043 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2044 !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
2045 return false;
2046 /*
2047 * Validate the magic number of the agi block.
2048 */
2049 if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
2050 return false;
2051 if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
2052 return false;
2053
2054 /*
2055 * during growfs operations, the perag is not fully initialised,
2056 * so we can't use it for any useful checking. growfs ensures we can't
2057 * use it by using uncached buffers that don't have the perag attached
2058 * so we can detect and avoid this problem.
2059 */
2060 if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
2061 return false;
2062
2063 xfs_check_agi_unlinked(agi);
2064 return true;
2065}
2066
2067static void
2068xfs_agi_read_verify(
2069 struct xfs_buf *bp)
2070{
2071 struct xfs_mount *mp = bp->b_target->bt_mount;
2072
2073 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2074 !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
2075 xfs_buf_ioerror(bp, -EFSBADCRC);
2076 else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
2077 XFS_ERRTAG_IALLOC_READ_AGI,
2078 XFS_RANDOM_IALLOC_READ_AGI))
2079 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2080
2081 if (bp->b_error)
2082 xfs_verifier_error(bp);
2083}
2084
2085static void
2086xfs_agi_write_verify(
2087 struct xfs_buf *bp)
2088{
2089 struct xfs_mount *mp = bp->b_target->bt_mount;
2090 struct xfs_buf_log_item *bip = bp->b_fspriv;
2091
2092 if (!xfs_agi_verify(bp)) {
2093 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2094 xfs_verifier_error(bp);
2095 return;
2096 }
2097
2098 if (!xfs_sb_version_hascrc(&mp->m_sb))
2099 return;
2100
2101 if (bip)
2102 XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2103 xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
2104}
2105
2106const struct xfs_buf_ops xfs_agi_buf_ops = {
2107 .verify_read = xfs_agi_read_verify,
2108 .verify_write = xfs_agi_write_verify,
2109};
2110
2111/*
2112 * Read in the allocation group header (inode allocation section)
2113 */
2114int
2115xfs_read_agi(
2116 struct xfs_mount *mp, /* file system mount structure */
2117 struct xfs_trans *tp, /* transaction pointer */
2118 xfs_agnumber_t agno, /* allocation group number */
2119 struct xfs_buf **bpp) /* allocation group hdr buf */
2120{
2121 int error;
2122
2123 trace_xfs_read_agi(mp, agno);
2124
2125 ASSERT(agno != NULLAGNUMBER);
2126 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2127 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
2128 XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
2129 if (error)
2130 return error;
2131
2132 xfs_buf_set_ref(*bpp, XFS_AGI_REF);
2133 return 0;
2134}
2135
2136int
2137xfs_ialloc_read_agi(
2138 struct xfs_mount *mp, /* file system mount structure */
2139 struct xfs_trans *tp, /* transaction pointer */
2140 xfs_agnumber_t agno, /* allocation group number */
2141 struct xfs_buf **bpp) /* allocation group hdr buf */
2142{
2143 struct xfs_agi *agi; /* allocation group header */
2144 struct xfs_perag *pag; /* per allocation group data */
2145 int error;
2146
2147 trace_xfs_ialloc_read_agi(mp, agno);
2148
2149 error = xfs_read_agi(mp, tp, agno, bpp);
2150 if (error)
2151 return error;
2152
2153 agi = XFS_BUF_TO_AGI(*bpp);
2154 pag = xfs_perag_get(mp, agno);
2155 if (!pag->pagi_init) {
2156 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
2157 pag->pagi_count = be32_to_cpu(agi->agi_count);
2158 pag->pagi_init = 1;
2159 }
2160
2161 /*
2162 * It's possible for these to be out of sync if
2163 * we are in the middle of a forced shutdown.
2164 */
2165 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
2166 XFS_FORCED_SHUTDOWN(mp));
2167 xfs_perag_put(pag);
2168 return 0;
2169}
2170
2171/*
2172 * Read in the agi to initialise the per-ag data in the mount structure
2173 */
2174int
2175xfs_ialloc_pagi_init(
2176 xfs_mount_t *mp, /* file system mount structure */
2177 xfs_trans_t *tp, /* transaction pointer */
2178 xfs_agnumber_t agno) /* allocation group number */
2179{
2180 xfs_buf_t *bp = NULL;
2181 int error;
2182
2183 error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
2184 if (error)
2185 return error;
2186 if (bp)
2187 xfs_trans_brelse(tp, bp);
2188 return 0;
2189}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
new file mode 100644
index 000000000000..95ad1c002d60
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -0,0 +1,163 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IALLOC_H__
19#define __XFS_IALLOC_H__
20
21struct xfs_buf;
22struct xfs_dinode;
23struct xfs_imap;
24struct xfs_mount;
25struct xfs_trans;
26struct xfs_btree_cur;
27
28/* Move inodes in clusters of this size */
29#define XFS_INODE_BIG_CLUSTER_SIZE 8192
30
31/* Calculate and return the number of filesystem blocks per inode cluster */
32static inline int
33xfs_icluster_size_fsb(
34 struct xfs_mount *mp)
35{
36 if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
37 return 1;
38 return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
39}
40
41/*
42 * Make an inode pointer out of the buffer/offset.
43 */
44static inline struct xfs_dinode *
45xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
46{
47 return (struct xfs_dinode *)
48 (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
49}
50
/*
 * Allocate an inode on disk.
 * Mode is used to tell whether the new inode will need space, and whether
 * it is a directory.
 *
 * To work within the constraint of one allocation per transaction,
 * xfs_dialloc() is designed to be called twice if it has to do an
 * allocation to make more free inodes.  If an inode is
 * available without an allocation, its number is returned in *inop and
 * *agbp is set to NULL.
 * If an allocation needed to be done, *inop is set to NULLFSINO and *agbp
 * is set to the inode header buffer of the allocation group.
 * The caller should then commit the current transaction and allocate a new
 * transaction.  xfs_dialloc() should then be called again with
 * the agbp value returned from the previous call.
 *
 * Once we successfully pick an inode its number is returned and the
 * on-disk data structures are updated.  The inode itself is not read
 * in, since doing so would break ordering constraints with xfs_reclaim.
 *
 * *agbp should be set to NULL on the first call.
 */
73int /* error */
74xfs_dialloc(
75 struct xfs_trans *tp, /* transaction pointer */
76 xfs_ino_t parent, /* parent inode (directory) */
77 umode_t mode, /* mode bits for new inode */
78 int okalloc, /* ok to allocate more space */
79 struct xfs_buf **agbp, /* buf for a.g. inode header */
80 xfs_ino_t *inop); /* inode number allocated */
81
82/*
83 * Free disk inode. Carefully avoids touching the incore inode, all
84 * manipulations incore are the caller's responsibility.
85 * The on-disk inode is not changed by this operation, only the
86 * btree (free inode mask) is changed.
87 */
88int /* error */
89xfs_difree(
90 struct xfs_trans *tp, /* transaction pointer */
91 xfs_ino_t inode, /* inode to be freed */
92 struct xfs_bmap_free *flist, /* extents to free */
93 int *deleted, /* set if inode cluster was deleted */
94 xfs_ino_t *first_ino); /* first inode in deleted cluster */
95
96/*
97 * Return the location of the inode in imap, for mapping it into a buffer.
98 */
99int
100xfs_imap(
101 struct xfs_mount *mp, /* file system mount structure */
102 struct xfs_trans *tp, /* transaction pointer */
103 xfs_ino_t ino, /* inode to locate */
104 struct xfs_imap *imap, /* location map structure */
105 uint flags); /* flags for inode btree lookup */
106
107/*
108 * Compute and fill in value of m_in_maxlevels.
109 */
110void
111xfs_ialloc_compute_maxlevels(
112 struct xfs_mount *mp); /* file system mount structure */
113
114/*
115 * Log specified fields for the ag hdr (inode section)
116 */
117void
118xfs_ialloc_log_agi(
119 struct xfs_trans *tp, /* transaction pointer */
120 struct xfs_buf *bp, /* allocation group header buffer */
121 int fields); /* bitmask of fields to log */
122
123/*
124 * Read in the allocation group header (inode allocation section)
125 */
126int /* error */
127xfs_ialloc_read_agi(
128 struct xfs_mount *mp, /* file system mount structure */
129 struct xfs_trans *tp, /* transaction pointer */
130 xfs_agnumber_t agno, /* allocation group number */
131 struct xfs_buf **bpp); /* allocation group hdr buf */
132
133/*
134 * Read in the allocation group header to initialise the per-ag data
135 * in the mount structure
136 */
137int
138xfs_ialloc_pagi_init(
139 struct xfs_mount *mp, /* file system mount structure */
140 struct xfs_trans *tp, /* transaction pointer */
141 xfs_agnumber_t agno); /* allocation group number */
142
143/*
144 * Lookup a record by ino in the btree given by cur.
145 */
146int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
147 xfs_lookup_t dir, int *stat);
148
149/*
150 * Get the data from the pointed-to record.
151 */
152int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
153 xfs_inobt_rec_incore_t *rec, int *stat);
154
155/*
156 * Inode chunk initialisation routine
157 */
158int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
159 struct list_head *buffer_list,
160 xfs_agnumber_t agno, xfs_agblock_t agbno,
161 xfs_agblock_t length, unsigned int gen);
162
163#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000000..c9b06f30fe86
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -0,0 +1,422 @@
1/*
2 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_inode.h"
29#include "xfs_btree.h"
30#include "xfs_ialloc.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_alloc.h"
33#include "xfs_error.h"
34#include "xfs_trace.h"
35#include "xfs_cksum.h"
36#include "xfs_trans.h"
37
38
39STATIC int
40xfs_inobt_get_minrecs(
41 struct xfs_btree_cur *cur,
42 int level)
43{
44 return cur->bc_mp->m_inobt_mnr[level != 0];
45}
46
47STATIC struct xfs_btree_cur *
48xfs_inobt_dup_cursor(
49 struct xfs_btree_cur *cur)
50{
51 return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
52 cur->bc_private.a.agbp, cur->bc_private.a.agno,
53 cur->bc_btnum);
54}
55
56STATIC void
57xfs_inobt_set_root(
58 struct xfs_btree_cur *cur,
59 union xfs_btree_ptr *nptr,
60 int inc) /* level change */
61{
62 struct xfs_buf *agbp = cur->bc_private.a.agbp;
63 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
64
65 agi->agi_root = nptr->s;
66 be32_add_cpu(&agi->agi_level, inc);
67 xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
68}
69
70STATIC void
71xfs_finobt_set_root(
72 struct xfs_btree_cur *cur,
73 union xfs_btree_ptr *nptr,
74 int inc) /* level change */
75{
76 struct xfs_buf *agbp = cur->bc_private.a.agbp;
77 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
78
79 agi->agi_free_root = nptr->s;
80 be32_add_cpu(&agi->agi_free_level, inc);
81 xfs_ialloc_log_agi(cur->bc_tp, agbp,
82 XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
83}
84
85STATIC int
86xfs_inobt_alloc_block(
87 struct xfs_btree_cur *cur,
88 union xfs_btree_ptr *start,
89 union xfs_btree_ptr *new,
90 int *stat)
91{
92 xfs_alloc_arg_t args; /* block allocation args */
93 int error; /* error return value */
94 xfs_agblock_t sbno = be32_to_cpu(start->s);
95
96 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
97
98 memset(&args, 0, sizeof(args));
99 args.tp = cur->bc_tp;
100 args.mp = cur->bc_mp;
101 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
102 args.minlen = 1;
103 args.maxlen = 1;
104 args.prod = 1;
105 args.type = XFS_ALLOCTYPE_NEAR_BNO;
106
107 error = xfs_alloc_vextent(&args);
108 if (error) {
109 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
110 return error;
111 }
112 if (args.fsbno == NULLFSBLOCK) {
113 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
114 *stat = 0;
115 return 0;
116 }
117 ASSERT(args.len == 1);
118 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
119
120 new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
121 *stat = 1;
122 return 0;
123}
124
125STATIC int
126xfs_inobt_free_block(
127 struct xfs_btree_cur *cur,
128 struct xfs_buf *bp)
129{
130 xfs_fsblock_t fsbno;
131 int error;
132
133 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
134 error = xfs_free_extent(cur->bc_tp, fsbno, 1);
135 if (error)
136 return error;
137
138 xfs_trans_binval(cur->bc_tp, bp);
139 return error;
140}
141
142STATIC int
143xfs_inobt_get_maxrecs(
144 struct xfs_btree_cur *cur,
145 int level)
146{
147 return cur->bc_mp->m_inobt_mxr[level != 0];
148}
149
150STATIC void
151xfs_inobt_init_key_from_rec(
152 union xfs_btree_key *key,
153 union xfs_btree_rec *rec)
154{
155 key->inobt.ir_startino = rec->inobt.ir_startino;
156}
157
158STATIC void
159xfs_inobt_init_rec_from_key(
160 union xfs_btree_key *key,
161 union xfs_btree_rec *rec)
162{
163 rec->inobt.ir_startino = key->inobt.ir_startino;
164}
165
166STATIC void
167xfs_inobt_init_rec_from_cur(
168 struct xfs_btree_cur *cur,
169 union xfs_btree_rec *rec)
170{
171 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
172 rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
173 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
174}
175
176/*
177 * initial value of ptr for lookup
178 */
179STATIC void
180xfs_inobt_init_ptr_from_cur(
181 struct xfs_btree_cur *cur,
182 union xfs_btree_ptr *ptr)
183{
184 struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
185
186 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
187
188 ptr->s = agi->agi_root;
189}
190
191STATIC void
192xfs_finobt_init_ptr_from_cur(
193 struct xfs_btree_cur *cur,
194 union xfs_btree_ptr *ptr)
195{
196 struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
197
198 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
199 ptr->s = agi->agi_free_root;
200}
201
202STATIC __int64_t
203xfs_inobt_key_diff(
204 struct xfs_btree_cur *cur,
205 union xfs_btree_key *key)
206{
207 return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
208 cur->bc_rec.i.ir_startino;
209}
210
211static int
212xfs_inobt_verify(
213 struct xfs_buf *bp)
214{
215 struct xfs_mount *mp = bp->b_target->bt_mount;
216 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
217 struct xfs_perag *pag = bp->b_pag;
218 unsigned int level;
219
220 /*
221 * During growfs operations, we can't verify the exact owner as the
222 * perag is not fully initialised and hence not attached to the buffer.
223 *
224 * Similarly, during log recovery we will have a perag structure
225 * attached, but the agi information will not yet have been initialised
226 * from the on disk AGI. We don't currently use any of this information,
227 * but beware of the landmine (i.e. need to check pag->pagi_init) if we
228 * ever do.
229 */
230 switch (block->bb_magic) {
231 case cpu_to_be32(XFS_IBT_CRC_MAGIC):
232 case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
233 if (!xfs_sb_version_hascrc(&mp->m_sb))
234 return false;
235 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
236 return false;
237 if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
238 return false;
239 if (pag &&
240 be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
241 return false;
242 /* fall through */
243 case cpu_to_be32(XFS_IBT_MAGIC):
244 case cpu_to_be32(XFS_FIBT_MAGIC):
245 break;
246 default:
247 return 0;
248 }
249
250 /* numrecs and level verification */
251 level = be16_to_cpu(block->bb_level);
252 if (level >= mp->m_in_maxlevels)
253 return false;
254 if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
255 return false;
256
257 /* sibling pointer verification */
258 if (!block->bb_u.s.bb_leftsib ||
259 (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
260 block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
261 return false;
262 if (!block->bb_u.s.bb_rightsib ||
263 (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
264 block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
265 return false;
266
267 return true;
268}
269
270static void
271xfs_inobt_read_verify(
272 struct xfs_buf *bp)
273{
274 if (!xfs_btree_sblock_verify_crc(bp))
275 xfs_buf_ioerror(bp, -EFSBADCRC);
276 else if (!xfs_inobt_verify(bp))
277 xfs_buf_ioerror(bp, -EFSCORRUPTED);
278
279 if (bp->b_error) {
280 trace_xfs_btree_corrupt(bp, _RET_IP_);
281 xfs_verifier_error(bp);
282 }
283}
284
285static void
286xfs_inobt_write_verify(
287 struct xfs_buf *bp)
288{
289 if (!xfs_inobt_verify(bp)) {
290 trace_xfs_btree_corrupt(bp, _RET_IP_);
291 xfs_buf_ioerror(bp, -EFSCORRUPTED);
292 xfs_verifier_error(bp);
293 return;
294 }
295 xfs_btree_sblock_calc_crc(bp);
296
297}
298
299const struct xfs_buf_ops xfs_inobt_buf_ops = {
300 .verify_read = xfs_inobt_read_verify,
301 .verify_write = xfs_inobt_write_verify,
302};
303
304#if defined(DEBUG) || defined(XFS_WARN)
305STATIC int
306xfs_inobt_keys_inorder(
307 struct xfs_btree_cur *cur,
308 union xfs_btree_key *k1,
309 union xfs_btree_key *k2)
310{
311 return be32_to_cpu(k1->inobt.ir_startino) <
312 be32_to_cpu(k2->inobt.ir_startino);
313}
314
315STATIC int
316xfs_inobt_recs_inorder(
317 struct xfs_btree_cur *cur,
318 union xfs_btree_rec *r1,
319 union xfs_btree_rec *r2)
320{
321 return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
322 be32_to_cpu(r2->inobt.ir_startino);
323}
324#endif /* DEBUG */
325
326static const struct xfs_btree_ops xfs_inobt_ops = {
327 .rec_len = sizeof(xfs_inobt_rec_t),
328 .key_len = sizeof(xfs_inobt_key_t),
329
330 .dup_cursor = xfs_inobt_dup_cursor,
331 .set_root = xfs_inobt_set_root,
332 .alloc_block = xfs_inobt_alloc_block,
333 .free_block = xfs_inobt_free_block,
334 .get_minrecs = xfs_inobt_get_minrecs,
335 .get_maxrecs = xfs_inobt_get_maxrecs,
336 .init_key_from_rec = xfs_inobt_init_key_from_rec,
337 .init_rec_from_key = xfs_inobt_init_rec_from_key,
338 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
339 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
340 .key_diff = xfs_inobt_key_diff,
341 .buf_ops = &xfs_inobt_buf_ops,
342#if defined(DEBUG) || defined(XFS_WARN)
343 .keys_inorder = xfs_inobt_keys_inorder,
344 .recs_inorder = xfs_inobt_recs_inorder,
345#endif
346};
347
348static const struct xfs_btree_ops xfs_finobt_ops = {
349 .rec_len = sizeof(xfs_inobt_rec_t),
350 .key_len = sizeof(xfs_inobt_key_t),
351
352 .dup_cursor = xfs_inobt_dup_cursor,
353 .set_root = xfs_finobt_set_root,
354 .alloc_block = xfs_inobt_alloc_block,
355 .free_block = xfs_inobt_free_block,
356 .get_minrecs = xfs_inobt_get_minrecs,
357 .get_maxrecs = xfs_inobt_get_maxrecs,
358 .init_key_from_rec = xfs_inobt_init_key_from_rec,
359 .init_rec_from_key = xfs_inobt_init_rec_from_key,
360 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
361 .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
362 .key_diff = xfs_inobt_key_diff,
363 .buf_ops = &xfs_inobt_buf_ops,
364#if defined(DEBUG) || defined(XFS_WARN)
365 .keys_inorder = xfs_inobt_keys_inorder,
366 .recs_inorder = xfs_inobt_recs_inorder,
367#endif
368};
369
370/*
371 * Allocate a new inode btree cursor.
372 */
373struct xfs_btree_cur * /* new inode btree cursor */
374xfs_inobt_init_cursor(
375 struct xfs_mount *mp, /* file system mount point */
376 struct xfs_trans *tp, /* transaction pointer */
377 struct xfs_buf *agbp, /* buffer for agi structure */
378 xfs_agnumber_t agno, /* allocation group number */
379 xfs_btnum_t btnum) /* ialloc or free ino btree */
380{
381 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
382 struct xfs_btree_cur *cur;
383
384 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
385
386 cur->bc_tp = tp;
387 cur->bc_mp = mp;
388 cur->bc_btnum = btnum;
389 if (btnum == XFS_BTNUM_INO) {
390 cur->bc_nlevels = be32_to_cpu(agi->agi_level);
391 cur->bc_ops = &xfs_inobt_ops;
392 } else {
393 cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
394 cur->bc_ops = &xfs_finobt_ops;
395 }
396
397 cur->bc_blocklog = mp->m_sb.sb_blocklog;
398
399 if (xfs_sb_version_hascrc(&mp->m_sb))
400 cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
401
402 cur->bc_private.a.agbp = agbp;
403 cur->bc_private.a.agno = agno;
404
405 return cur;
406}
407
408/*
409 * Calculate number of records in an inobt btree block.
410 */
411int
412xfs_inobt_maxrecs(
413 struct xfs_mount *mp,
414 int blocklen,
415 int leaf)
416{
417 blocklen -= XFS_INOBT_BLOCK_LEN(mp);
418
419 if (leaf)
420 return blocklen / sizeof(xfs_inobt_rec_t);
421 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
422}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000000..d7ebea72c2d0
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -0,0 +1,65 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IALLOC_BTREE_H__
19#define __XFS_IALLOC_BTREE_H__
20
21/*
22 * Inode map on-disk structures
23 */
24
25struct xfs_buf;
26struct xfs_btree_cur;
27struct xfs_mount;
28
/*
 * Btree block header size depends on a superblock flag: the CRC-format
 * header length when xfs_sb_version_hascrc() is true, the plain short-form
 * header length otherwise.
 */
#define XFS_INOBT_BLOCK_LEN(mp) \
	(!xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
		XFS_BTREE_SBLOCK_LEN : XFS_BTREE_SBLOCK_CRC_LEN)
35
/*
 * Record, key, and pointer address macros for btree blocks.  Entry @index
 * is 1-based: index 1 is the first entry after the block header (records and
 * keys) or after the space for @maxrecs keys (pointers).
 *
 * (note that some of these may appear unused, but they are used in userspace)
 */
#define XFS_INOBT_REC_ADDR(mp, block, index) \
	((xfs_inobt_rec_t *) \
		((char *)(block) + XFS_INOBT_BLOCK_LEN(mp)) + \
	 ((index) - 1))

#define XFS_INOBT_KEY_ADDR(mp, block, index) \
	((xfs_inobt_key_t *) \
		((char *)(block) + XFS_INOBT_BLOCK_LEN(mp)) + \
	 ((index) - 1))

#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
	((xfs_inobt_ptr_t *) \
		((char *)(block) + \
		 XFS_INOBT_BLOCK_LEN(mp) + \
		 (maxrecs) * sizeof(xfs_inobt_key_t)) + \
	 ((index) - 1))
59
60extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
61 struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
62 xfs_btnum_t);
63extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
64
65#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
new file mode 100644
index 000000000000..f18fd2da49f7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -0,0 +1,479 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27#include "xfs_inode.h"
28#include "xfs_error.h"
29#include "xfs_cksum.h"
30#include "xfs_icache.h"
31#include "xfs_trans.h"
32#include "xfs_ialloc.h"
33#include "xfs_dinode.h"
34
/*
 * Debug-only check: emit an alert for every inode in the buffer whose
 * next-unlinked field is 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
	xfs_mount_t	*mp,
	xfs_buf_t	*bp)
{
	int		nr_inodes;
	int		idx;

	/* Number of inodes in one inode cluster. */
	nr_inodes = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

	for (idx = 0; idx < nr_inodes; idx++) {
		xfs_dinode_t	*dip;

		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					idx * mp->m_sb.sb_inodesize);
		if (dip->di_next_unlinked)
			continue;

		xfs_alert(mp,
	"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
			idx, (long long)bp->b_bn);
	}
}
#endif
62
63/*
64 * If we are doing readahead on an inode buffer, we might be in log recovery
65 * reading an inode allocation buffer that hasn't yet been replayed, and hence
66 * has not had the inode cores stamped into it. Hence for readahead, the buffer
67 * may be potentially invalid.
68 *
69 * If the readahead buffer is invalid, we don't want to mark it with an error,
70 * but we do want to clear the DONE status of the buffer so that a followup read
71 * will re-read it from disk. This will ensure that we don't get an unnecessary
72 * warnings during log recovery and we don't get unnecssary panics on debug
73 * kernels.
74 */
75static void
76xfs_inode_buf_verify(
77 struct xfs_buf *bp,
78 bool readahead)
79{
80 struct xfs_mount *mp = bp->b_target->bt_mount;
81 int i;
82 int ni;
83
84 /*
85 * Validate the magic number and version of every inode in the buffer
86 */
87 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
88 for (i = 0; i < ni; i++) {
89 int di_ok;
90 xfs_dinode_t *dip;
91
92 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
93 (i << mp->m_sb.sb_inodelog));
94 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
95 XFS_DINODE_GOOD_VERSION(dip->di_version);
96 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
97 XFS_ERRTAG_ITOBP_INOTOBP,
98 XFS_RANDOM_ITOBP_INOTOBP))) {
99 if (readahead) {
100 bp->b_flags &= ~XBF_DONE;
101 return;
102 }
103
104 xfs_buf_ioerror(bp, -EFSCORRUPTED);
105 xfs_verifier_error(bp);
106#ifdef DEBUG
107 xfs_alert(mp,
108 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
109 (unsigned long long)bp->b_bn, i,
110 be16_to_cpu(dip->di_magic));
111#endif
112 }
113 }
114 xfs_inobp_check(mp, bp);
115}
116
117
118static void
119xfs_inode_buf_read_verify(
120 struct xfs_buf *bp)
121{
122 xfs_inode_buf_verify(bp, false);
123}
124
125static void
126xfs_inode_buf_readahead_verify(
127 struct xfs_buf *bp)
128{
129 xfs_inode_buf_verify(bp, true);
130}
131
132static void
133xfs_inode_buf_write_verify(
134 struct xfs_buf *bp)
135{
136 xfs_inode_buf_verify(bp, false);
137}
138
139const struct xfs_buf_ops xfs_inode_buf_ops = {
140 .verify_read = xfs_inode_buf_read_verify,
141 .verify_write = xfs_inode_buf_write_verify,
142};
143
144const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
145 .verify_read = xfs_inode_buf_readahead_verify,
146 .verify_write = xfs_inode_buf_write_verify,
147};
148
149
150/*
151 * This routine is called to map an inode to the buffer containing the on-disk
152 * version of the inode. It returns a pointer to the buffer containing the
153 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
154 * pointer to the on-disk inode within that buffer.
155 *
156 * If a non-zero error is returned, then the contents of bpp and dipp are
157 * undefined.
158 */
159int
160xfs_imap_to_bp(
161 struct xfs_mount *mp,
162 struct xfs_trans *tp,
163 struct xfs_imap *imap,
164 struct xfs_dinode **dipp,
165 struct xfs_buf **bpp,
166 uint buf_flags,
167 uint iget_flags)
168{
169 struct xfs_buf *bp;
170 int error;
171
172 buf_flags |= XBF_UNMAPPED;
173 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
174 (int)imap->im_len, buf_flags, &bp,
175 &xfs_inode_buf_ops);
176 if (error) {
177 if (error == -EAGAIN) {
178 ASSERT(buf_flags & XBF_TRYLOCK);
179 return error;
180 }
181
182 if (error == -EFSCORRUPTED &&
183 (iget_flags & XFS_IGET_UNTRUSTED))
184 return -EINVAL;
185
186 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
187 __func__, error);
188 return error;
189 }
190
191 *bpp = bp;
192 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
193 return 0;
194}
195
196void
197xfs_dinode_from_disk(
198 xfs_icdinode_t *to,
199 xfs_dinode_t *from)
200{
201 to->di_magic = be16_to_cpu(from->di_magic);
202 to->di_mode = be16_to_cpu(from->di_mode);
203 to->di_version = from ->di_version;
204 to->di_format = from->di_format;
205 to->di_onlink = be16_to_cpu(from->di_onlink);
206 to->di_uid = be32_to_cpu(from->di_uid);
207 to->di_gid = be32_to_cpu(from->di_gid);
208 to->di_nlink = be32_to_cpu(from->di_nlink);
209 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
210 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
211 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
212 to->di_flushiter = be16_to_cpu(from->di_flushiter);
213 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
214 to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
215 to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
216 to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
217 to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
218 to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
219 to->di_size = be64_to_cpu(from->di_size);
220 to->di_nblocks = be64_to_cpu(from->di_nblocks);
221 to->di_extsize = be32_to_cpu(from->di_extsize);
222 to->di_nextents = be32_to_cpu(from->di_nextents);
223 to->di_anextents = be16_to_cpu(from->di_anextents);
224 to->di_forkoff = from->di_forkoff;
225 to->di_aformat = from->di_aformat;
226 to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
227 to->di_dmstate = be16_to_cpu(from->di_dmstate);
228 to->di_flags = be16_to_cpu(from->di_flags);
229 to->di_gen = be32_to_cpu(from->di_gen);
230
231 if (to->di_version == 3) {
232 to->di_changecount = be64_to_cpu(from->di_changecount);
233 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
234 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
235 to->di_flags2 = be64_to_cpu(from->di_flags2);
236 to->di_ino = be64_to_cpu(from->di_ino);
237 to->di_lsn = be64_to_cpu(from->di_lsn);
238 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
239 uuid_copy(&to->di_uuid, &from->di_uuid);
240 }
241}
242
243void
244xfs_dinode_to_disk(
245 xfs_dinode_t *to,
246 xfs_icdinode_t *from)
247{
248 to->di_magic = cpu_to_be16(from->di_magic);
249 to->di_mode = cpu_to_be16(from->di_mode);
250 to->di_version = from ->di_version;
251 to->di_format = from->di_format;
252 to->di_onlink = cpu_to_be16(from->di_onlink);
253 to->di_uid = cpu_to_be32(from->di_uid);
254 to->di_gid = cpu_to_be32(from->di_gid);
255 to->di_nlink = cpu_to_be32(from->di_nlink);
256 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
257 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
258 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
259 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
260 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
261 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
262 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
263 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
264 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
265 to->di_size = cpu_to_be64(from->di_size);
266 to->di_nblocks = cpu_to_be64(from->di_nblocks);
267 to->di_extsize = cpu_to_be32(from->di_extsize);
268 to->di_nextents = cpu_to_be32(from->di_nextents);
269 to->di_anextents = cpu_to_be16(from->di_anextents);
270 to->di_forkoff = from->di_forkoff;
271 to->di_aformat = from->di_aformat;
272 to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
273 to->di_dmstate = cpu_to_be16(from->di_dmstate);
274 to->di_flags = cpu_to_be16(from->di_flags);
275 to->di_gen = cpu_to_be32(from->di_gen);
276
277 if (from->di_version == 3) {
278 to->di_changecount = cpu_to_be64(from->di_changecount);
279 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
280 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
281 to->di_flags2 = cpu_to_be64(from->di_flags2);
282 to->di_ino = cpu_to_be64(from->di_ino);
283 to->di_lsn = cpu_to_be64(from->di_lsn);
284 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
285 uuid_copy(&to->di_uuid, &from->di_uuid);
286 to->di_flushiter = 0;
287 } else {
288 to->di_flushiter = cpu_to_be16(from->di_flushiter);
289 }
290}
291
292static bool
293xfs_dinode_verify(
294 struct xfs_mount *mp,
295 struct xfs_inode *ip,
296 struct xfs_dinode *dip)
297{
298 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
299 return false;
300
301 /* only version 3 or greater inodes are extensively verified here */
302 if (dip->di_version < 3)
303 return true;
304
305 if (!xfs_sb_version_hascrc(&mp->m_sb))
306 return false;
307 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
308 XFS_DINODE_CRC_OFF))
309 return false;
310 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
311 return false;
312 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
313 return false;
314 return true;
315}
316
317void
318xfs_dinode_calc_crc(
319 struct xfs_mount *mp,
320 struct xfs_dinode *dip)
321{
322 __uint32_t crc;
323
324 if (dip->di_version < 3)
325 return;
326
327 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
328 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
329 XFS_DINODE_CRC_OFF);
330 dip->di_crc = xfs_end_cksum(crc);
331}
332
333/*
334 * Read the disk inode attributes into the in-core inode structure.
335 *
336 * For version 5 superblocks, if we are initialising a new inode and we are not
337 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
338 * inode core with a random generation number. If we are keeping inodes around,
339 * we need to read the inode cluster to get the existing generation number off
340 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
341 * format) then log recovery is dependent on the di_flushiter field being
342 * initialised from the current on-disk value and hence we must also read the
343 * inode off disk.
344 */
345int
346xfs_iread(
347 xfs_mount_t *mp,
348 xfs_trans_t *tp,
349 xfs_inode_t *ip,
350 uint iget_flags)
351{
352 xfs_buf_t *bp;
353 xfs_dinode_t *dip;
354 int error;
355
356 /*
357 * Fill in the location information in the in-core inode.
358 */
359 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
360 if (error)
361 return error;
362
363 /* shortcut IO on inode allocation if possible */
364 if ((iget_flags & XFS_IGET_CREATE) &&
365 xfs_sb_version_hascrc(&mp->m_sb) &&
366 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
367 /* initialise the on-disk inode core */
368 memset(&ip->i_d, 0, sizeof(ip->i_d));
369 ip->i_d.di_magic = XFS_DINODE_MAGIC;
370 ip->i_d.di_gen = prandom_u32();
371 if (xfs_sb_version_hascrc(&mp->m_sb)) {
372 ip->i_d.di_version = 3;
373 ip->i_d.di_ino = ip->i_ino;
374 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
375 } else
376 ip->i_d.di_version = 2;
377 return 0;
378 }
379
380 /*
381 * Get pointers to the on-disk inode and the buffer containing it.
382 */
383 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
384 if (error)
385 return error;
386
387 /* even unallocated inodes are verified */
388 if (!xfs_dinode_verify(mp, ip, dip)) {
389 xfs_alert(mp, "%s: validation failed for inode %lld failed",
390 __func__, ip->i_ino);
391
392 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
393 error = -EFSCORRUPTED;
394 goto out_brelse;
395 }
396
397 /*
398 * If the on-disk inode is already linked to a directory
399 * entry, copy all of the inode into the in-core inode.
400 * xfs_iformat_fork() handles copying in the inode format
401 * specific information.
402 * Otherwise, just get the truly permanent information.
403 */
404 if (dip->di_mode) {
405 xfs_dinode_from_disk(&ip->i_d, dip);
406 error = xfs_iformat_fork(ip, dip);
407 if (error) {
408#ifdef DEBUG
409 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
410 __func__, error);
411#endif /* DEBUG */
412 goto out_brelse;
413 }
414 } else {
415 /*
416 * Partial initialisation of the in-core inode. Just the bits
417 * that xfs_ialloc won't overwrite or relies on being correct.
418 */
419 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
420 ip->i_d.di_version = dip->di_version;
421 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
422 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
423
424 if (dip->di_version == 3) {
425 ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
426 uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
427 }
428
429 /*
430 * Make sure to pull in the mode here as well in
431 * case the inode is released without being used.
432 * This ensures that xfs_inactive() will see that
433 * the inode is already free and not try to mess
434 * with the uninitialized part of it.
435 */
436 ip->i_d.di_mode = 0;
437 }
438
439 /*
440 * Automatically convert version 1 inode formats in memory to version 2
441 * inode format. If the inode is modified, it will get logged and
442 * rewritten as a version 2 inode. We can do this because we set the
443 * superblock feature bit for v2 inodes unconditionally during mount
444 * and it means the reast of the code can assume the inode version is 2
445 * or higher.
446 */
447 if (ip->i_d.di_version == 1) {
448 ip->i_d.di_version = 2;
449 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
450 ip->i_d.di_nlink = ip->i_d.di_onlink;
451 ip->i_d.di_onlink = 0;
452 xfs_set_projid(ip, 0);
453 }
454
455 ip->i_delayed_blks = 0;
456
457 /*
458 * Mark the buffer containing the inode as something to keep
459 * around for a while. This helps to keep recently accessed
460 * meta-data in-core longer.
461 */
462 xfs_buf_set_ref(bp, XFS_INO_REF);
463
464 /*
465 * Use xfs_trans_brelse() to release the buffer containing the on-disk
466 * inode, because it was acquired with xfs_trans_read_buf() in
467 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
468 * brelse(). If we're within a transaction, then xfs_trans_brelse()
469 * will only release the buffer if it is not dirty within the
470 * transaction. It will be OK to release the buffer in this case,
471 * because inodes on disk are never destroyed and we will be locking the
472 * new in-core inode before putting it in the cache where other
473 * processes can find it. Thus we don't have to worry about the inode
474 * being changed just because we released the buffer.
475 */
476 out_brelse:
477 xfs_trans_brelse(tp, bp);
478 return error;
479}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
new file mode 100644
index 000000000000..9308c47f2a52
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -0,0 +1,50 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INODE_BUF_H__
19#define __XFS_INODE_BUF_H__
20
21struct xfs_inode;
22struct xfs_dinode;
23struct xfs_icdinode;
24
25/*
26 * Inode location information. Stored in the inode (ip->i_imap) and passed to
27 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
28 */
29struct xfs_imap {
30 xfs_daddr_t im_blkno; /* starting BB of inode chunk; block number given to xfs_trans_read_buf() */
31 ushort im_len; /* length in BBs of inode chunk; buffer length given to xfs_trans_read_buf() */
32 ushort im_boffset; /* inode offset in bytes; passed to xfs_buf_offset() to locate the dinode */
33};
34
35int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
36 struct xfs_imap *, struct xfs_dinode **,
37 struct xfs_buf **, uint, uint);
38int xfs_iread(struct xfs_mount *, struct xfs_trans *,
39 struct xfs_inode *, uint);
40void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
41void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
42void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
43
44#if defined(DEBUG)
45void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
46#else
47#define xfs_inobp_check(mp, bp)
48#endif /* DEBUG */
49
50#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
new file mode 100644
index 000000000000..8ac9411bcf2a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -0,0 +1,1906 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <linux/log2.h>
19
20#include "xfs.h"
21#include "xfs_fs.h"
22#include "xfs_format.h"
23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_inode.h"
30#include "xfs_trans.h"
31#include "xfs_inode_item.h"
32#include "xfs_bmap_btree.h"
33#include "xfs_bmap.h"
34#include "xfs_error.h"
35#include "xfs_trace.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h"
38
39kmem_zone_t *xfs_ifork_zone;
40
41STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
42STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
43STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
44
#ifdef DEBUG
/*
 * Debug-only sanity pass over the first nrecs in-core extent records of
 * a fork.  Each record is unpacked, and when the extent format carries
 * no state information every extent is asserted to be a normal one.
 */
void
xfs_validate_extents(
	xfs_ifork_t		*ifp,
	int			nrecs,
	xfs_exntfmt_t		fmt)
{
	xfs_bmbt_irec_t		unpacked;
	xfs_bmbt_rec_host_t	copy;
	int			n = 0;

	while (n < nrecs) {
		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, n);

		/* Take an unaligned-safe copy before decoding it. */
		copy.l0 = get_unaligned(&ep->l0);
		copy.l1 = get_unaligned(&ep->l1);
		xfs_bmbt_get_all(&copy, &unpacked);
		if (fmt == XFS_EXTFMT_NOSTATE)
			ASSERT(unpacked.br_state == XFS_EXT_NORM);
		n++;
	}
}
#else /* DEBUG */
/* Non-debug builds compile the validation away entirely. */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */
72
73
74/*
75 * Move inode type and inode format specific information from the
76 * on-disk inode to the in-core inode. For fifos, devs, and sockets
77 * this means set if_rdev to the proper value. For files, directories,
78 * and symlinks this means to bring in the in-line data or extent
79 * pointers. For a file in B-tree format, only the root is immediately
80 * brought in-core. The rest will be in-lined in if_extents when it
81 * is first referenced (see xfs_iread_extents()).
82 */
83int
84xfs_iformat_fork(
85 xfs_inode_t *ip,
86 xfs_dinode_t *dip)
87{
88 xfs_attr_shortform_t *atp;
89 int size;
90 int error = 0;
91 xfs_fsize_t di_size;
92
93 if (unlikely(be32_to_cpu(dip->di_nextents) +
94 be16_to_cpu(dip->di_anextents) >
95 be64_to_cpu(dip->di_nblocks))) {
96 xfs_warn(ip->i_mount,
97 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
98 (unsigned long long)ip->i_ino,
99 (int)(be32_to_cpu(dip->di_nextents) +
100 be16_to_cpu(dip->di_anextents)),
101 (unsigned long long)
102 be64_to_cpu(dip->di_nblocks));
103 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
104 ip->i_mount, dip);
105 return -EFSCORRUPTED;
106 }
107
108 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
109 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
110 (unsigned long long)ip->i_ino,
111 dip->di_forkoff);
112 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
113 ip->i_mount, dip);
114 return -EFSCORRUPTED;
115 }
116
117 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
118 !ip->i_mount->m_rtdev_targp)) {
119 xfs_warn(ip->i_mount,
120 "corrupt dinode %Lu, has realtime flag set.",
121 ip->i_ino);
122 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
123 XFS_ERRLEVEL_LOW, ip->i_mount, dip);
124 return -EFSCORRUPTED;
125 }
126
127 switch (ip->i_d.di_mode & S_IFMT) {
128 case S_IFIFO:
129 case S_IFCHR:
130 case S_IFBLK:
131 case S_IFSOCK:
132 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
133 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
134 ip->i_mount, dip);
135 return -EFSCORRUPTED;
136 }
137 ip->i_d.di_size = 0;
138 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
139 break;
140
141 case S_IFREG:
142 case S_IFLNK:
143 case S_IFDIR:
144 switch (dip->di_format) {
145 case XFS_DINODE_FMT_LOCAL:
146 /*
147 * no local regular files yet
148 */
149 if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
150 xfs_warn(ip->i_mount,
151 "corrupt inode %Lu (local format for regular file).",
152 (unsigned long long) ip->i_ino);
153 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
154 XFS_ERRLEVEL_LOW,
155 ip->i_mount, dip);
156 return -EFSCORRUPTED;
157 }
158
159 di_size = be64_to_cpu(dip->di_size);
160 if (unlikely(di_size < 0 ||
161 di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
162 xfs_warn(ip->i_mount,
163 "corrupt inode %Lu (bad size %Ld for local inode).",
164 (unsigned long long) ip->i_ino,
165 (long long) di_size);
166 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
167 XFS_ERRLEVEL_LOW,
168 ip->i_mount, dip);
169 return -EFSCORRUPTED;
170 }
171
172 size = (int)di_size;
173 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
174 break;
175 case XFS_DINODE_FMT_EXTENTS:
176 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
177 break;
178 case XFS_DINODE_FMT_BTREE:
179 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
180 break;
181 default:
182 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
183 ip->i_mount);
184 return -EFSCORRUPTED;
185 }
186 break;
187
188 default:
189 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
190 return -EFSCORRUPTED;
191 }
192 if (error) {
193 return error;
194 }
195 if (!XFS_DFORK_Q(dip))
196 return 0;
197
198 ASSERT(ip->i_afp == NULL);
199 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
200
201 switch (dip->di_aformat) {
202 case XFS_DINODE_FMT_LOCAL:
203 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
204 size = be16_to_cpu(atp->hdr.totsize);
205
206 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
207 xfs_warn(ip->i_mount,
208 "corrupt inode %Lu (bad attr fork size %Ld).",
209 (unsigned long long) ip->i_ino,
210 (long long) size);
211 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
212 XFS_ERRLEVEL_LOW,
213 ip->i_mount, dip);
214 return -EFSCORRUPTED;
215 }
216
217 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
218 break;
219 case XFS_DINODE_FMT_EXTENTS:
220 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
221 break;
222 case XFS_DINODE_FMT_BTREE:
223 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
224 break;
225 default:
226 error = -EFSCORRUPTED;
227 break;
228 }
229 if (error) {
230 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
231 ip->i_afp = NULL;
232 xfs_idestroy_fork(ip, XFS_DATA_FORK);
233 }
234 return error;
235}
236
237/*
238 * The file is in-lined in the on-disk inode.
239 * If it fits into if_inline_data, then copy
240 * it there, otherwise allocate a buffer for it
241 * and copy the data there. Either way, set
242 * if_data to point at the data.
243 * If we allocate a buffer for the data, make
244 * sure that its size is a multiple of 4 and
245 * record the real size in i_real_bytes.
246 */
247STATIC int
248xfs_iformat_local(
249 xfs_inode_t *ip,
250 xfs_dinode_t *dip,
251 int whichfork,
252 int size)
253{
254 xfs_ifork_t *ifp;
255 int real_size;
256
257 /*
258 * If the size is unreasonable, then something
259 * is wrong and we just bail out rather than crash in
260 * kmem_alloc() or memcpy() below.
261 */
262 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
263 xfs_warn(ip->i_mount,
264 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
265 (unsigned long long) ip->i_ino, size,
266 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
267 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
268 ip->i_mount, dip);
269 return -EFSCORRUPTED;
270 }
271 ifp = XFS_IFORK_PTR(ip, whichfork);
272 real_size = 0;
273 if (size == 0)
274 ifp->if_u1.if_data = NULL;
275 else if (size <= sizeof(ifp->if_u2.if_inline_data))
276 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
277 else {
278 real_size = roundup(size, 4);
279 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
280 }
281 ifp->if_bytes = size;
282 ifp->if_real_bytes = real_size;
283 if (size)
284 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
285 ifp->if_flags &= ~XFS_IFEXTENTS;
286 ifp->if_flags |= XFS_IFINLINE;
287 return 0;
288}
289
290/*
291 * The file consists of a set of extents all
292 * of which fit into the on-disk inode.
293 * If there are few enough extents to fit into
294 * the if_inline_ext, then copy them there.
295 * Otherwise allocate a buffer for them and copy
296 * them into it. Either way, set if_extents
297 * to point at the extents.
298 */
299STATIC int
300xfs_iformat_extents(
301 xfs_inode_t *ip,
302 xfs_dinode_t *dip,
303 int whichfork)
304{
305 xfs_bmbt_rec_t *dp;
306 xfs_ifork_t *ifp;
307 int nex;
308 int size;
309 int i;
310
311 ifp = XFS_IFORK_PTR(ip, whichfork);
312 nex = XFS_DFORK_NEXTENTS(dip, whichfork);
313 size = nex * (uint)sizeof(xfs_bmbt_rec_t);
314
315 /*
316 * If the number of extents is unreasonable, then something
317 * is wrong and we just bail out rather than crash in
318 * kmem_alloc() or memcpy() below.
319 */
320 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
321 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
322 (unsigned long long) ip->i_ino, nex);
323 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
324 ip->i_mount, dip);
325 return -EFSCORRUPTED;
326 }
327
328 ifp->if_real_bytes = 0;
329 if (nex == 0)
330 ifp->if_u1.if_extents = NULL;
331 else if (nex <= XFS_INLINE_EXTS)
332 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
333 else
334 xfs_iext_add(ifp, 0, nex);
335
336 ifp->if_bytes = size;
337 if (size) {
338 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
339 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
340 for (i = 0; i < nex; i++, dp++) {
341 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
342 ep->l0 = get_unaligned_be64(&dp->l0);
343 ep->l1 = get_unaligned_be64(&dp->l1);
344 }
345 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
346 if (whichfork != XFS_DATA_FORK ||
347 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
348 if (unlikely(xfs_check_nostate_extents(
349 ifp, 0, nex))) {
350 XFS_ERROR_REPORT("xfs_iformat_extents(2)",
351 XFS_ERRLEVEL_LOW,
352 ip->i_mount);
353 return -EFSCORRUPTED;
354 }
355 }
356 ifp->if_flags |= XFS_IFEXTENTS;
357 return 0;
358}
359
360/*
361 * The file has too many extents to fit into
362 * the inode, so they are in B-tree format.
363 * Allocate a buffer for the root of the B-tree
364 * and copy the root into it. The i_extents
365 * field will remain NULL until all of the
366 * extents are read in (when they are needed).
367 */
368STATIC int
369xfs_iformat_btree(
370 xfs_inode_t *ip,
371 xfs_dinode_t *dip,
372 int whichfork)
373{
374 struct xfs_mount *mp = ip->i_mount;
375 xfs_bmdr_block_t *dfp;
376 xfs_ifork_t *ifp;
377 /* REFERENCED */
378 int nrecs;
379 int size;
380
381 ifp = XFS_IFORK_PTR(ip, whichfork);
382 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
383 size = XFS_BMAP_BROOT_SPACE(mp, dfp);
384 nrecs = be16_to_cpu(dfp->bb_numrecs);
385
386 /*
387 * blow out if -- fork has less extents than can fit in
388 * fork (fork shouldn't be a btree format), root btree
389 * block has more records than can fit into the fork,
390 * or the number of extents is greater than the number of
391 * blocks.
392 */
393 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
394 XFS_IFORK_MAXEXT(ip, whichfork) ||
395 XFS_BMDR_SPACE_CALC(nrecs) >
396 XFS_DFORK_SIZE(dip, mp, whichfork) ||
397 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
398 xfs_warn(mp, "corrupt inode %Lu (btree).",
399 (unsigned long long) ip->i_ino);
400 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
401 mp, dip);
402 return -EFSCORRUPTED;
403 }
404
405 ifp->if_broot_bytes = size;
406 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
407 ASSERT(ifp->if_broot != NULL);
408 /*
409 * Copy and convert from the on-disk structure
410 * to the in-memory structure.
411 */
412 xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
413 ifp->if_broot, size);
414 ifp->if_flags &= ~XFS_IFEXTENTS;
415 ifp->if_flags |= XFS_IFBROOT;
416
417 return 0;
418}
419
420/*
421 * Read in extents from a btree-format inode.
422 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
423 */
424int
425xfs_iread_extents(
426 xfs_trans_t *tp,
427 xfs_inode_t *ip,
428 int whichfork)
429{
430 int error;
431 xfs_ifork_t *ifp;
432 xfs_extnum_t nextents;
433
434 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
435
436 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
437 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
438 ip->i_mount);
439 return -EFSCORRUPTED;
440 }
441 nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
442 ifp = XFS_IFORK_PTR(ip, whichfork);
443
444 /*
445 * We know that the size is valid (it's checked in iformat_btree)
446 */
447 ifp->if_bytes = ifp->if_real_bytes = 0;
448 ifp->if_flags |= XFS_IFEXTENTS;
449 xfs_iext_add(ifp, 0, nextents);
450 error = xfs_bmap_read_extents(tp, ip, whichfork);
451 if (error) {
452 xfs_iext_destroy(ifp);
453 ifp->if_flags &= ~XFS_IFEXTENTS;
454 return error;
455 }
456 xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
457 return 0;
458}
459/*
460 * Reallocate the space for if_broot based on the number of records
461 * being added or deleted as indicated in rec_diff. Move the records
462 * and pointers in if_broot to fit the new size. When shrinking this
463 * will eliminate holes between the records and pointers created by
464 * the caller. When growing this will create holes to be filled in
465 * by the caller.
466 *
467 * The caller must not request to add more records than would fit in
468 * the on-disk inode root. If the if_broot is currently NULL, then
469 * if we are adding records, one will be allocated. The caller must also
470 * not request that the number of records go below zero, although
471 * it can go to zero.
472 *
473 * ip -- the inode whose if_broot area is changing
474 * ext_diff -- the change in the number of records, positive or negative,
475 * requested for the if_broot array.
476 */
477void
478xfs_iroot_realloc(
479 xfs_inode_t *ip,
480 int rec_diff,
481 int whichfork)
482{
483 struct xfs_mount *mp = ip->i_mount;
484 int cur_max;
485 xfs_ifork_t *ifp;
486 struct xfs_btree_block *new_broot;
487 int new_max;
488 size_t new_size;
489 char *np;
490 char *op;
491
492 /*
493 * Handle the degenerate case quietly.
494 */
495 if (rec_diff == 0) {
496 return;
497 }
498
499 ifp = XFS_IFORK_PTR(ip, whichfork);
500 if (rec_diff > 0) {
501 /*
502 * If there wasn't any memory allocated before, just
503 * allocate it now and get out.
504 */
505 if (ifp->if_broot_bytes == 0) {
506 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
507 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
508 ifp->if_broot_bytes = (int)new_size;
509 return;
510 }
511
512 /*
513 * If there is already an existing if_broot, then we need
514 * to realloc() it and shift the pointers to their new
515 * location. The records don't change location because
516 * they are kept butted up against the btree block header.
517 */
518 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
519 new_max = cur_max + rec_diff;
520 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
521 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
522 XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
523 KM_SLEEP | KM_NOFS);
524 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
525 ifp->if_broot_bytes);
526 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
527 (int)new_size);
528 ifp->if_broot_bytes = (int)new_size;
529 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
530 XFS_IFORK_SIZE(ip, whichfork));
531 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
532 return;
533 }
534
535 /*
536 * rec_diff is less than 0. In this case, we are shrinking the
537 * if_broot buffer. It must already exist. If we go to zero
538 * records, just get rid of the root and clear the status bit.
539 */
540 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
541 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
542 new_max = cur_max + rec_diff;
543 ASSERT(new_max >= 0);
544 if (new_max > 0)
545 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
546 else
547 new_size = 0;
548 if (new_size > 0) {
549 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
550 /*
551 * First copy over the btree block header.
552 */
553 memcpy(new_broot, ifp->if_broot,
554 XFS_BMBT_BLOCK_LEN(ip->i_mount));
555 } else {
556 new_broot = NULL;
557 ifp->if_flags &= ~XFS_IFBROOT;
558 }
559
560 /*
561 * Only copy the records and pointers if there are any.
562 */
563 if (new_max > 0) {
564 /*
565 * First copy the records.
566 */
567 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
568 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
569 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
570
571 /*
572 * Then copy the pointers.
573 */
574 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
575 ifp->if_broot_bytes);
576 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
577 (int)new_size);
578 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
579 }
580 kmem_free(ifp->if_broot);
581 ifp->if_broot = new_broot;
582 ifp->if_broot_bytes = (int)new_size;
583 if (ifp->if_broot)
584 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
585 XFS_IFORK_SIZE(ip, whichfork));
586 return;
587}
588
589
590/*
591 * This is called when the amount of space needed for if_data
592 * is increased or decreased. The change in size is indicated by
593 * the number of bytes that need to be added or deleted in the
594 * byte_diff parameter.
595 *
596 * If the amount of space needed has decreased below the size of the
597 * inline buffer, then switch to using the inline buffer. Otherwise,
598 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
599 * to what is needed.
600 *
601 * ip -- the inode whose if_data area is changing
602 * byte_diff -- the change in the number of bytes, positive or negative,
603 * requested for the if_data array.
604 */
605void
606xfs_idata_realloc(
607 xfs_inode_t *ip,
608 int byte_diff,
609 int whichfork)
610{
611 xfs_ifork_t *ifp;
612 int new_size;
613 int real_size;
614
615 if (byte_diff == 0) {
616 return;
617 }
618
619 ifp = XFS_IFORK_PTR(ip, whichfork);
620 new_size = (int)ifp->if_bytes + byte_diff;
621 ASSERT(new_size >= 0);
622
623 if (new_size == 0) {
624 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
625 kmem_free(ifp->if_u1.if_data);
626 }
627 ifp->if_u1.if_data = NULL;
628 real_size = 0;
629 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
630 /*
631 * If the valid extents/data can fit in if_inline_ext/data,
632 * copy them from the malloc'd vector and free it.
633 */
634 if (ifp->if_u1.if_data == NULL) {
635 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
636 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
637 ASSERT(ifp->if_real_bytes != 0);
638 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
639 new_size);
640 kmem_free(ifp->if_u1.if_data);
641 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
642 }
643 real_size = 0;
644 } else {
645 /*
646 * Stuck with malloc/realloc.
647 * For inline data, the underlying buffer must be
648 * a multiple of 4 bytes in size so that it can be
649 * logged and stay on word boundaries. We enforce
650 * that here.
651 */
652 real_size = roundup(new_size, 4);
653 if (ifp->if_u1.if_data == NULL) {
654 ASSERT(ifp->if_real_bytes == 0);
655 ifp->if_u1.if_data = kmem_alloc(real_size,
656 KM_SLEEP | KM_NOFS);
657 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
658 /*
659 * Only do the realloc if the underlying size
660 * is really changing.
661 */
662 if (ifp->if_real_bytes != real_size) {
663 ifp->if_u1.if_data =
664 kmem_realloc(ifp->if_u1.if_data,
665 real_size,
666 ifp->if_real_bytes,
667 KM_SLEEP | KM_NOFS);
668 }
669 } else {
670 ASSERT(ifp->if_real_bytes == 0);
671 ifp->if_u1.if_data = kmem_alloc(real_size,
672 KM_SLEEP | KM_NOFS);
673 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
674 ifp->if_bytes);
675 }
676 }
677 ifp->if_real_bytes = real_size;
678 ifp->if_bytes = new_size;
679 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
680}
681
682void
683xfs_idestroy_fork(
684 xfs_inode_t *ip,
685 int whichfork)
686{
687 xfs_ifork_t *ifp;
688
689 ifp = XFS_IFORK_PTR(ip, whichfork);
690 if (ifp->if_broot != NULL) {
691 kmem_free(ifp->if_broot);
692 ifp->if_broot = NULL;
693 }
694
695 /*
696 * If the format is local, then we can't have an extents
697 * array so just look for an inline data array. If we're
698 * not local then we may or may not have an extents list,
699 * so check and free it up if we do.
700 */
701 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
702 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
703 (ifp->if_u1.if_data != NULL)) {
704 ASSERT(ifp->if_real_bytes != 0);
705 kmem_free(ifp->if_u1.if_data);
706 ifp->if_u1.if_data = NULL;
707 ifp->if_real_bytes = 0;
708 }
709 } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
710 ((ifp->if_flags & XFS_IFEXTIREC) ||
711 ((ifp->if_u1.if_extents != NULL) &&
712 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
713 ASSERT(ifp->if_real_bytes != 0);
714 xfs_iext_destroy(ifp);
715 }
716 ASSERT(ifp->if_u1.if_extents == NULL ||
717 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
718 ASSERT(ifp->if_real_bytes == 0);
719 if (whichfork == XFS_ATTR_FORK) {
720 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
721 ip->i_afp = NULL;
722 }
723}
724
725/*
726 * Convert in-core extents to on-disk form
727 *
728 * For either the data or attr fork in extent format, we need to endian convert
729 * the in-core extent as we place them into the on-disk inode.
730 *
731 * In the case of the data fork, the in-core and on-disk fork sizes can be
732 * different due to delayed allocation extents. We only copy on-disk extents
733 * here, so callers must always use the physical fork size to determine the
734 * size of the buffer passed to this routine. We will return the size actually
735 * used.
736 */
737int
738xfs_iextents_copy(
739 xfs_inode_t *ip,
740 xfs_bmbt_rec_t *dp,
741 int whichfork)
742{
743 int copied;
744 int i;
745 xfs_ifork_t *ifp;
746 int nrecs;
747 xfs_fsblock_t start_block;
748
749 ifp = XFS_IFORK_PTR(ip, whichfork);
750 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
751 ASSERT(ifp->if_bytes > 0);
752
753 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
754 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
755 ASSERT(nrecs > 0);
756
757 /*
758 * There are some delayed allocation extents in the
759 * inode, so copy the extents one at a time and skip
760 * the delayed ones. There must be at least one
761 * non-delayed extent.
762 */
763 copied = 0;
764 for (i = 0; i < nrecs; i++) {
765 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
766 start_block = xfs_bmbt_get_startblock(ep);
767 if (isnullstartblock(start_block)) {
768 /*
769 * It's a delayed allocation extent, so skip it.
770 */
771 continue;
772 }
773
774 /* Translate to on disk format */
775 put_unaligned_be64(ep->l0, &dp->l0);
776 put_unaligned_be64(ep->l1, &dp->l1);
777 dp++;
778 copied++;
779 }
780 ASSERT(copied != 0);
781 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
782
783 return (copied * (uint)sizeof(xfs_bmbt_rec_t));
784}
785
786/*
787 * Each of the following cases stores data into the same region
788 * of the on-disk inode, so only one of them can be valid at
789 * any given time. While it is possible to have conflicting formats
790 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
791 * in EXTENTS format, this can only happen when the fork has
792 * changed formats after being modified but before being flushed.
793 * In these cases, the format always takes precedence, because the
794 * format indicates the current state of the fork.
795 */
796void
797xfs_iflush_fork(
798 xfs_inode_t *ip,
799 xfs_dinode_t *dip,
800 xfs_inode_log_item_t *iip,
801 int whichfork)
802{
803 char *cp;
804 xfs_ifork_t *ifp;
805 xfs_mount_t *mp;
806 static const short brootflag[2] =
807 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
808 static const short dataflag[2] =
809 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
810 static const short extflag[2] =
811 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
812
813 if (!iip)
814 return;
815 ifp = XFS_IFORK_PTR(ip, whichfork);
816 /*
817 * This can happen if we gave up in iformat in an error path,
818 * for the attribute fork.
819 */
820 if (!ifp) {
821 ASSERT(whichfork == XFS_ATTR_FORK);
822 return;
823 }
824 cp = XFS_DFORK_PTR(dip, whichfork);
825 mp = ip->i_mount;
826 switch (XFS_IFORK_FORMAT(ip, whichfork)) {
827 case XFS_DINODE_FMT_LOCAL:
828 if ((iip->ili_fields & dataflag[whichfork]) &&
829 (ifp->if_bytes > 0)) {
830 ASSERT(ifp->if_u1.if_data != NULL);
831 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
832 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
833 }
834 break;
835
836 case XFS_DINODE_FMT_EXTENTS:
837 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
838 !(iip->ili_fields & extflag[whichfork]));
839 if ((iip->ili_fields & extflag[whichfork]) &&
840 (ifp->if_bytes > 0)) {
841 ASSERT(xfs_iext_get_ext(ifp, 0));
842 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
843 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
844 whichfork);
845 }
846 break;
847
848 case XFS_DINODE_FMT_BTREE:
849 if ((iip->ili_fields & brootflag[whichfork]) &&
850 (ifp->if_broot_bytes > 0)) {
851 ASSERT(ifp->if_broot != NULL);
852 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
853 XFS_IFORK_SIZE(ip, whichfork));
854 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
855 (xfs_bmdr_block_t *)cp,
856 XFS_DFORK_SIZE(dip, mp, whichfork));
857 }
858 break;
859
860 case XFS_DINODE_FMT_DEV:
861 if (iip->ili_fields & XFS_ILOG_DEV) {
862 ASSERT(whichfork == XFS_DATA_FORK);
863 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
864 }
865 break;
866
867 case XFS_DINODE_FMT_UUID:
868 if (iip->ili_fields & XFS_ILOG_UUID) {
869 ASSERT(whichfork == XFS_DATA_FORK);
870 memcpy(XFS_DFORK_DPTR(dip),
871 &ip->i_df.if_u2.if_uuid,
872 sizeof(uuid_t));
873 }
874 break;
875
876 default:
877 ASSERT(0);
878 break;
879 }
880}
881
882/*
883 * Return a pointer to the extent record at file index idx.
884 */
885xfs_bmbt_rec_host_t *
886xfs_iext_get_ext(
887 xfs_ifork_t *ifp, /* inode fork pointer */
888 xfs_extnum_t idx) /* index of target extent */
889{
890 ASSERT(idx >= 0);
891 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
892
893 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
894 return ifp->if_u1.if_ext_irec->er_extbuf;
895 } else if (ifp->if_flags & XFS_IFEXTIREC) {
896 xfs_ext_irec_t *erp; /* irec pointer */
897 int erp_idx = 0; /* irec index */
898 xfs_extnum_t page_idx = idx; /* ext index in target list */
899
900 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
901 return &erp->er_extbuf[page_idx];
902 } else if (ifp->if_bytes) {
903 return &ifp->if_u1.if_extents[idx];
904 } else {
905 return NULL;
906 }
907}
908
909/*
910 * Insert new item(s) into the extent records for incore inode
911 * fork 'ifp'. 'count' new items are inserted at index 'idx'.
912 */
913void
914xfs_iext_insert(
915 xfs_inode_t *ip, /* incore inode pointer */
916 xfs_extnum_t idx, /* starting index of new items */
917 xfs_extnum_t count, /* number of inserted items */
918 xfs_bmbt_irec_t *new, /* items to insert */
919 int state) /* type of extent conversion */
920{
921 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
922 xfs_extnum_t i; /* extent record index */
923
924 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
925
926 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
927 xfs_iext_add(ifp, idx, count);
928 for (i = idx; i < idx + count; i++, new++)
929 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
930}
931
932/*
933 * This is called when the amount of space required for incore file
934 * extents needs to be increased. The ext_diff parameter stores the
935 * number of new extents being added and the idx parameter contains
936 * the extent index where the new extents will be added. If the new
937 * extents are being appended, then we just need to (re)allocate and
938 * initialize the space. Otherwise, if the new extents are being
939 * inserted into the middle of the existing entries, a bit more work
940 * is required to make room for the new extents to be inserted. The
941 * caller is responsible for filling in the new extent entries upon
942 * return.
943 */
944void
945xfs_iext_add(
946 xfs_ifork_t *ifp, /* inode fork pointer */
947 xfs_extnum_t idx, /* index to begin adding exts */
948 int ext_diff) /* number of extents to add */
949{
950 int byte_diff; /* new bytes being added */
951 int new_size; /* size of extents after adding */
952 xfs_extnum_t nextents; /* number of extents in file */
953
954 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
955 ASSERT((idx >= 0) && (idx <= nextents));
956 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
957 new_size = ifp->if_bytes + byte_diff;
958 /*
959 * If the new number of extents (nextents + ext_diff)
960 * fits inside the inode, then continue to use the inline
961 * extent buffer.
962 */
963 if (nextents + ext_diff <= XFS_INLINE_EXTS) {
964 if (idx < nextents) {
965 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
966 &ifp->if_u2.if_inline_ext[idx],
967 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
968 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
969 }
970 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
971 ifp->if_real_bytes = 0;
972 }
973 /*
974 * Otherwise use a linear (direct) extent list.
975 * If the extents are currently inside the inode,
976 * xfs_iext_realloc_direct will switch us from
977 * inline to direct extent allocation mode.
978 */
979 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
980 xfs_iext_realloc_direct(ifp, new_size);
981 if (idx < nextents) {
982 memmove(&ifp->if_u1.if_extents[idx + ext_diff],
983 &ifp->if_u1.if_extents[idx],
984 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
985 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
986 }
987 }
988 /* Indirection array */
989 else {
990 xfs_ext_irec_t *erp;
991 int erp_idx = 0;
992 int page_idx = idx;
993
994 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
995 if (ifp->if_flags & XFS_IFEXTIREC) {
996 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
997 } else {
998 xfs_iext_irec_init(ifp);
999 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1000 erp = ifp->if_u1.if_ext_irec;
1001 }
1002 /* Extents fit in target extent page */
1003 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
1004 if (page_idx < erp->er_extcount) {
1005 memmove(&erp->er_extbuf[page_idx + ext_diff],
1006 &erp->er_extbuf[page_idx],
1007 (erp->er_extcount - page_idx) *
1008 sizeof(xfs_bmbt_rec_t));
1009 memset(&erp->er_extbuf[page_idx], 0, byte_diff);
1010 }
1011 erp->er_extcount += ext_diff;
1012 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
1013 }
1014 /* Insert a new extent page */
1015 else if (erp) {
1016 xfs_iext_add_indirect_multi(ifp,
1017 erp_idx, page_idx, ext_diff);
1018 }
1019 /*
1020 * If extent(s) are being appended to the last page in
1021 * the indirection array and the new extent(s) don't fit
1022 * in the page, then erp is NULL and erp_idx is set to
1023 * the next index needed in the indirection array.
1024 */
1025 else {
1026 uint count = ext_diff;
1027
1028 while (count) {
1029 erp = xfs_iext_irec_new(ifp, erp_idx);
1030 erp->er_extcount = min(count, XFS_LINEAR_EXTS);
1031 count -= erp->er_extcount;
1032 if (count)
1033 erp_idx++;
1034 }
1035 }
1036 }
1037 ifp->if_bytes = new_size;
1038}
1039
1040/*
1041 * This is called when incore extents are being added to the indirection
1042 * array and the new extents do not fit in the target extent list. The
1043 * erp_idx parameter contains the irec index for the target extent list
1044 * in the indirection array, and the idx parameter contains the extent
1045 * index within the list. The number of extents being added is stored
1046 * in the count parameter.
1047 *
1048 * |-------| |-------|
1049 * | | | | idx - number of extents before idx
1050 * | idx | | count |
1051 * | | | | count - number of extents being inserted at idx
1052 * |-------| |-------|
1053 * | count | | nex2 | nex2 - number of extents after idx + count
1054 * |-------| |-------|
1055 */
1056void
1057xfs_iext_add_indirect_multi(
1058 xfs_ifork_t *ifp, /* inode fork pointer */
1059 int erp_idx, /* target extent irec index */
1060 xfs_extnum_t idx, /* index within target list */
1061 int count) /* new extents being added */
1062{
1063 int byte_diff; /* new bytes being added */
1064 xfs_ext_irec_t *erp; /* pointer to irec entry */
1065 xfs_extnum_t ext_diff; /* number of extents to add */
1066 xfs_extnum_t ext_cnt; /* new extents still needed */
1067 xfs_extnum_t nex2; /* extents after idx + count */
1068 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
1069 int nlists; /* number of irec's (lists) */
1070
1071 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1072 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1073 nex2 = erp->er_extcount - idx;
1074 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1075
1076 /*
1077 * Save second part of target extent list
1078 * (all extents past */
1079 if (nex2) {
1080 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
1081 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
1082 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
1083 erp->er_extcount -= nex2;
1084 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
1085 memset(&erp->er_extbuf[idx], 0, byte_diff);
1086 }
1087
1088 /*
1089 * Add the new extents to the end of the target
1090 * list, then allocate new irec record(s) and
1091 * extent buffer(s) as needed to store the rest
1092 * of the new extents.
1093 */
1094 ext_cnt = count;
1095 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
1096 if (ext_diff) {
1097 erp->er_extcount += ext_diff;
1098 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
1099 ext_cnt -= ext_diff;
1100 }
1101 while (ext_cnt) {
1102 erp_idx++;
1103 erp = xfs_iext_irec_new(ifp, erp_idx);
1104 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
1105 erp->er_extcount = ext_diff;
1106 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
1107 ext_cnt -= ext_diff;
1108 }
1109
1110 /* Add nex2 extents back to indirection array */
1111 if (nex2) {
1112 xfs_extnum_t ext_avail;
1113 int i;
1114
1115 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
1116 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
1117 i = 0;
1118 /*
1119 * If nex2 extents fit in the current page, append
1120 * nex2_ep after the new extents.
1121 */
1122 if (nex2 <= ext_avail) {
1123 i = erp->er_extcount;
1124 }
1125 /*
1126 * Otherwise, check if space is available in the
1127 * next page.
1128 */
1129 else if ((erp_idx < nlists - 1) &&
1130 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
1131 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
1132 erp_idx++;
1133 erp++;
1134 /* Create a hole for nex2 extents */
1135 memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
1136 erp->er_extcount * sizeof(xfs_bmbt_rec_t));
1137 }
1138 /*
1139 * Final choice, create a new extent page for
1140 * nex2 extents.
1141 */
1142 else {
1143 erp_idx++;
1144 erp = xfs_iext_irec_new(ifp, erp_idx);
1145 }
1146 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
1147 kmem_free(nex2_ep);
1148 erp->er_extcount += nex2;
1149 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
1150 }
1151}
1152
1153/*
1154 * This is called when the amount of space required for incore file
1155 * extents needs to be decreased. The ext_diff parameter stores the
1156 * number of extents to be removed and the idx parameter contains
1157 * the extent index where the extents will be removed from.
1158 *
1159 * If the amount of space needed has decreased below the linear
1160 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
1161 * extent array. Otherwise, use kmem_realloc() to adjust the
1162 * size to what is needed.
1163 */
1164void
1165xfs_iext_remove(
1166 xfs_inode_t *ip, /* incore inode pointer */
1167 xfs_extnum_t idx, /* index to begin removing exts */
1168 int ext_diff, /* number of extents to remove */
1169 int state) /* type of extent conversion */
1170{
1171 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
1172 xfs_extnum_t nextents; /* number of extents in file */
1173 int new_size; /* size of extents after removal */
1174
1175 trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
1176
1177 ASSERT(ext_diff > 0);
1178 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1179 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
1180
1181 if (new_size == 0) {
1182 xfs_iext_destroy(ifp);
1183 } else if (ifp->if_flags & XFS_IFEXTIREC) {
1184 xfs_iext_remove_indirect(ifp, idx, ext_diff);
1185 } else if (ifp->if_real_bytes) {
1186 xfs_iext_remove_direct(ifp, idx, ext_diff);
1187 } else {
1188 xfs_iext_remove_inline(ifp, idx, ext_diff);
1189 }
1190 ifp->if_bytes = new_size;
1191}
1192
1193/*
1194 * This removes ext_diff extents from the inline buffer, beginning
1195 * at extent index idx.
1196 */
1197void
1198xfs_iext_remove_inline(
1199 xfs_ifork_t *ifp, /* inode fork pointer */
1200 xfs_extnum_t idx, /* index to begin removing exts */
1201 int ext_diff) /* number of extents to remove */
1202{
1203 int nextents; /* number of extents in file */
1204
1205 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1206 ASSERT(idx < XFS_INLINE_EXTS);
1207 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1208 ASSERT(((nextents - ext_diff) > 0) &&
1209 (nextents - ext_diff) < XFS_INLINE_EXTS);
1210
1211 if (idx + ext_diff < nextents) {
1212 memmove(&ifp->if_u2.if_inline_ext[idx],
1213 &ifp->if_u2.if_inline_ext[idx + ext_diff],
1214 (nextents - (idx + ext_diff)) *
1215 sizeof(xfs_bmbt_rec_t));
1216 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
1217 0, ext_diff * sizeof(xfs_bmbt_rec_t));
1218 } else {
1219 memset(&ifp->if_u2.if_inline_ext[idx], 0,
1220 ext_diff * sizeof(xfs_bmbt_rec_t));
1221 }
1222}
1223
1224/*
1225 * This removes ext_diff extents from a linear (direct) extent list,
1226 * beginning at extent index idx. If the extents are being removed
1227 * from the end of the list (ie. truncate) then we just need to re-
1228 * allocate the list to remove the extra space. Otherwise, if the
1229 * extents are being removed from the middle of the existing extent
1230 * entries, then we first need to move the extent records beginning
1231 * at idx + ext_diff up in the list to overwrite the records being
1232 * removed, then remove the extra space via kmem_realloc.
1233 */
1234void
1235xfs_iext_remove_direct(
1236 xfs_ifork_t *ifp, /* inode fork pointer */
1237 xfs_extnum_t idx, /* index to begin removing exts */
1238 int ext_diff) /* number of extents to remove */
1239{
1240 xfs_extnum_t nextents; /* number of extents in file */
1241 int new_size; /* size of extents after removal */
1242
1243 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1244 new_size = ifp->if_bytes -
1245 (ext_diff * sizeof(xfs_bmbt_rec_t));
1246 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1247
1248 if (new_size == 0) {
1249 xfs_iext_destroy(ifp);
1250 return;
1251 }
1252 /* Move extents up in the list (if needed) */
1253 if (idx + ext_diff < nextents) {
1254 memmove(&ifp->if_u1.if_extents[idx],
1255 &ifp->if_u1.if_extents[idx + ext_diff],
1256 (nextents - (idx + ext_diff)) *
1257 sizeof(xfs_bmbt_rec_t));
1258 }
1259 memset(&ifp->if_u1.if_extents[nextents - ext_diff],
1260 0, ext_diff * sizeof(xfs_bmbt_rec_t));
1261 /*
1262 * Reallocate the direct extent list. If the extents
1263 * will fit inside the inode then xfs_iext_realloc_direct
1264 * will switch from direct to inline extent allocation
1265 * mode for us.
1266 */
1267 xfs_iext_realloc_direct(ifp, new_size);
1268 ifp->if_bytes = new_size;
1269}
1270
1271/*
1272 * This is called when incore extents are being removed from the
1273 * indirection array and the extents being removed span multiple extent
1274 * buffers. The idx parameter contains the file extent index where we
1275 * want to begin removing extents, and the count parameter contains
1276 * how many extents need to be removed.
1277 *
1278 * |-------| |-------|
1279 * | nex1 | | | nex1 - number of extents before idx
1280 * |-------| | count |
1281 * | | | | count - number of extents being removed at idx
1282 * | count | |-------|
1283 * | | | nex2 | nex2 - number of extents after idx + count
1284 * |-------| |-------|
1285 */
1286void
1287xfs_iext_remove_indirect(
1288 xfs_ifork_t *ifp, /* inode fork pointer */
1289 xfs_extnum_t idx, /* index to begin removing extents */
1290 int count) /* number of extents to remove */
1291{
1292 xfs_ext_irec_t *erp; /* indirection array pointer */
1293 int erp_idx = 0; /* indirection array index */
1294 xfs_extnum_t ext_cnt; /* extents left to remove */
1295 xfs_extnum_t ext_diff; /* extents to remove in current list */
1296 xfs_extnum_t nex1; /* number of extents before idx */
1297 xfs_extnum_t nex2; /* extents after idx + count */
1298 int page_idx = idx; /* index in target extent list */
1299
1300 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1301 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
1302 ASSERT(erp != NULL);
1303 nex1 = page_idx;
1304 ext_cnt = count;
1305 while (ext_cnt) {
1306 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
1307 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
1308 /*
1309 * Check for deletion of entire list;
1310 * xfs_iext_irec_remove() updates extent offsets.
1311 */
1312 if (ext_diff == erp->er_extcount) {
1313 xfs_iext_irec_remove(ifp, erp_idx);
1314 ext_cnt -= ext_diff;
1315 nex1 = 0;
1316 if (ext_cnt) {
1317 ASSERT(erp_idx < ifp->if_real_bytes /
1318 XFS_IEXT_BUFSZ);
1319 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1320 nex1 = 0;
1321 continue;
1322 } else {
1323 break;
1324 }
1325 }
1326 /* Move extents up (if needed) */
1327 if (nex2) {
1328 memmove(&erp->er_extbuf[nex1],
1329 &erp->er_extbuf[nex1 + ext_diff],
1330 nex2 * sizeof(xfs_bmbt_rec_t));
1331 }
1332 /* Zero out rest of page */
1333 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
1334 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
1335 /* Update remaining counters */
1336 erp->er_extcount -= ext_diff;
1337 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
1338 ext_cnt -= ext_diff;
1339 nex1 = 0;
1340 erp_idx++;
1341 erp++;
1342 }
1343 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
1344 xfs_iext_irec_compact(ifp);
1345}
1346
1347/*
1348 * Create, destroy, or resize a linear (direct) block of extents.
1349 */
1350void
1351xfs_iext_realloc_direct(
1352 xfs_ifork_t *ifp, /* inode fork pointer */
1353 int new_size) /* new size of extents after adding */
1354{
1355 int rnew_size; /* real new size of extents */
1356
1357 rnew_size = new_size;
1358
1359 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
1360 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
1361 (new_size != ifp->if_real_bytes)));
1362
1363 /* Free extent records */
1364 if (new_size == 0) {
1365 xfs_iext_destroy(ifp);
1366 }
1367 /* Resize direct extent list and zero any new bytes */
1368 else if (ifp->if_real_bytes) {
1369 /* Check if extents will fit inside the inode */
1370 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
1371 xfs_iext_direct_to_inline(ifp, new_size /
1372 (uint)sizeof(xfs_bmbt_rec_t));
1373 ifp->if_bytes = new_size;
1374 return;
1375 }
1376 if (!is_power_of_2(new_size)){
1377 rnew_size = roundup_pow_of_two(new_size);
1378 }
1379 if (rnew_size != ifp->if_real_bytes) {
1380 ifp->if_u1.if_extents =
1381 kmem_realloc(ifp->if_u1.if_extents,
1382 rnew_size,
1383 ifp->if_real_bytes, KM_NOFS);
1384 }
1385 if (rnew_size > ifp->if_real_bytes) {
1386 memset(&ifp->if_u1.if_extents[ifp->if_bytes /
1387 (uint)sizeof(xfs_bmbt_rec_t)], 0,
1388 rnew_size - ifp->if_real_bytes);
1389 }
1390 }
1391 /* Switch from the inline extent buffer to a direct extent list */
1392 else {
1393 if (!is_power_of_2(new_size)) {
1394 rnew_size = roundup_pow_of_two(new_size);
1395 }
1396 xfs_iext_inline_to_direct(ifp, rnew_size);
1397 }
1398 ifp->if_real_bytes = rnew_size;
1399 ifp->if_bytes = new_size;
1400}
1401
1402/*
1403 * Switch from linear (direct) extent records to inline buffer.
1404 */
1405void
1406xfs_iext_direct_to_inline(
1407 xfs_ifork_t *ifp, /* inode fork pointer */
1408 xfs_extnum_t nextents) /* number of extents in file */
1409{
1410 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
1411 ASSERT(nextents <= XFS_INLINE_EXTS);
1412 /*
1413 * The inline buffer was zeroed when we switched
1414 * from inline to direct extent allocation mode,
1415 * so we don't need to clear it here.
1416 */
1417 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
1418 nextents * sizeof(xfs_bmbt_rec_t));
1419 kmem_free(ifp->if_u1.if_extents);
1420 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
1421 ifp->if_real_bytes = 0;
1422}
1423
1424/*
1425 * Switch from inline buffer to linear (direct) extent records.
1426 * new_size should already be rounded up to the next power of 2
1427 * by the caller (when appropriate), so use new_size as it is.
1428 * However, since new_size may be rounded up, we can't update
1429 * if_bytes here. It is the caller's responsibility to update
1430 * if_bytes upon return.
1431 */
1432void
1433xfs_iext_inline_to_direct(
1434 xfs_ifork_t *ifp, /* inode fork pointer */
1435 int new_size) /* number of extents in file */
1436{
1437 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
1438 memset(ifp->if_u1.if_extents, 0, new_size);
1439 if (ifp->if_bytes) {
1440 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
1441 ifp->if_bytes);
1442 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
1443 sizeof(xfs_bmbt_rec_t));
1444 }
1445 ifp->if_real_bytes = new_size;
1446}
1447
1448/*
1449 * Resize an extent indirection array to new_size bytes.
1450 */
1451STATIC void
1452xfs_iext_realloc_indirect(
1453 xfs_ifork_t *ifp, /* inode fork pointer */
1454 int new_size) /* new indirection array size */
1455{
1456 int nlists; /* number of irec's (ex lists) */
1457 int size; /* current indirection array size */
1458
1459 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1460 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1461 size = nlists * sizeof(xfs_ext_irec_t);
1462 ASSERT(ifp->if_real_bytes);
1463 ASSERT((new_size >= 0) && (new_size != size));
1464 if (new_size == 0) {
1465 xfs_iext_destroy(ifp);
1466 } else {
1467 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
1468 kmem_realloc(ifp->if_u1.if_ext_irec,
1469 new_size, size, KM_NOFS);
1470 }
1471}
1472
1473/*
1474 * Switch from indirection array to linear (direct) extent allocations.
1475 */
1476STATIC void
1477xfs_iext_indirect_to_direct(
1478 xfs_ifork_t *ifp) /* inode fork pointer */
1479{
1480 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
1481 xfs_extnum_t nextents; /* number of extents in file */
1482 int size; /* size of file extents */
1483
1484 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1485 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1486 ASSERT(nextents <= XFS_LINEAR_EXTS);
1487 size = nextents * sizeof(xfs_bmbt_rec_t);
1488
1489 xfs_iext_irec_compact_pages(ifp);
1490 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
1491
1492 ep = ifp->if_u1.if_ext_irec->er_extbuf;
1493 kmem_free(ifp->if_u1.if_ext_irec);
1494 ifp->if_flags &= ~XFS_IFEXTIREC;
1495 ifp->if_u1.if_extents = ep;
1496 ifp->if_bytes = size;
1497 if (nextents < XFS_LINEAR_EXTS) {
1498 xfs_iext_realloc_direct(ifp, size);
1499 }
1500}
1501
1502/*
1503 * Free incore file extents.
1504 */
1505void
1506xfs_iext_destroy(
1507 xfs_ifork_t *ifp) /* inode fork pointer */
1508{
1509 if (ifp->if_flags & XFS_IFEXTIREC) {
1510 int erp_idx;
1511 int nlists;
1512
1513 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1514 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
1515 xfs_iext_irec_remove(ifp, erp_idx);
1516 }
1517 ifp->if_flags &= ~XFS_IFEXTIREC;
1518 } else if (ifp->if_real_bytes) {
1519 kmem_free(ifp->if_u1.if_extents);
1520 } else if (ifp->if_bytes) {
1521 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
1522 sizeof(xfs_bmbt_rec_t));
1523 }
1524 ifp->if_u1.if_extents = NULL;
1525 ifp->if_real_bytes = 0;
1526 ifp->if_bytes = 0;
1527}
1528
1529/*
1530 * Return a pointer to the extent record for file system block bno.
1531 */
1532xfs_bmbt_rec_host_t * /* pointer to found extent record */
1533xfs_iext_bno_to_ext(
1534 xfs_ifork_t *ifp, /* inode fork pointer */
1535 xfs_fileoff_t bno, /* block number to search for */
1536 xfs_extnum_t *idxp) /* index of target extent */
1537{
1538 xfs_bmbt_rec_host_t *base; /* pointer to first extent */
1539 xfs_filblks_t blockcount = 0; /* number of blocks in extent */
1540 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
1541 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
1542 int high; /* upper boundary in search */
1543 xfs_extnum_t idx = 0; /* index of target extent */
1544 int low; /* lower boundary in search */
1545 xfs_extnum_t nextents; /* number of file extents */
1546 xfs_fileoff_t startoff = 0; /* start offset of extent */
1547
1548 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1549 if (nextents == 0) {
1550 *idxp = 0;
1551 return NULL;
1552 }
1553 low = 0;
1554 if (ifp->if_flags & XFS_IFEXTIREC) {
1555 /* Find target extent list */
1556 int erp_idx = 0;
1557 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
1558 base = erp->er_extbuf;
1559 high = erp->er_extcount - 1;
1560 } else {
1561 base = ifp->if_u1.if_extents;
1562 high = nextents - 1;
1563 }
1564 /* Binary search extent records */
1565 while (low <= high) {
1566 idx = (low + high) >> 1;
1567 ep = base + idx;
1568 startoff = xfs_bmbt_get_startoff(ep);
1569 blockcount = xfs_bmbt_get_blockcount(ep);
1570 if (bno < startoff) {
1571 high = idx - 1;
1572 } else if (bno >= startoff + blockcount) {
1573 low = idx + 1;
1574 } else {
1575 /* Convert back to file-based extent index */
1576 if (ifp->if_flags & XFS_IFEXTIREC) {
1577 idx += erp->er_extoff;
1578 }
1579 *idxp = idx;
1580 return ep;
1581 }
1582 }
1583 /* Convert back to file-based extent index */
1584 if (ifp->if_flags & XFS_IFEXTIREC) {
1585 idx += erp->er_extoff;
1586 }
1587 if (bno >= startoff + blockcount) {
1588 if (++idx == nextents) {
1589 ep = NULL;
1590 } else {
1591 ep = xfs_iext_get_ext(ifp, idx);
1592 }
1593 }
1594 *idxp = idx;
1595 return ep;
1596}
1597
1598/*
1599 * Return a pointer to the indirection array entry containing the
1600 * extent record for filesystem block bno. Store the index of the
1601 * target irec in *erp_idxp.
1602 */
1603xfs_ext_irec_t * /* pointer to found extent record */
1604xfs_iext_bno_to_irec(
1605 xfs_ifork_t *ifp, /* inode fork pointer */
1606 xfs_fileoff_t bno, /* block number to search for */
1607 int *erp_idxp) /* irec index of target ext list */
1608{
1609 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
1610 xfs_ext_irec_t *erp_next; /* next indirection array entry */
1611 int erp_idx; /* indirection array index */
1612 int nlists; /* number of extent irec's (lists) */
1613 int high; /* binary search upper limit */
1614 int low; /* binary search lower limit */
1615
1616 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1617 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1618 erp_idx = 0;
1619 low = 0;
1620 high = nlists - 1;
1621 while (low <= high) {
1622 erp_idx = (low + high) >> 1;
1623 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1624 erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
1625 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
1626 high = erp_idx - 1;
1627 } else if (erp_next && bno >=
1628 xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
1629 low = erp_idx + 1;
1630 } else {
1631 break;
1632 }
1633 }
1634 *erp_idxp = erp_idx;
1635 return erp;
1636}
1637
1638/*
1639 * Return a pointer to the indirection array entry containing the
1640 * extent record at file extent index *idxp. Store the index of the
1641 * target irec in *erp_idxp and store the page index of the target
1642 * extent record in *idxp.
1643 */
1644xfs_ext_irec_t *
1645xfs_iext_idx_to_irec(
1646 xfs_ifork_t *ifp, /* inode fork pointer */
1647 xfs_extnum_t *idxp, /* extent index (file -> page) */
1648 int *erp_idxp, /* pointer to target irec */
1649 int realloc) /* new bytes were just added */
1650{
1651 xfs_ext_irec_t *prev; /* pointer to previous irec */
1652 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
1653 int erp_idx; /* indirection array index */
1654 int nlists; /* number of irec's (ex lists) */
1655 int high; /* binary search upper limit */
1656 int low; /* binary search lower limit */
1657 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
1658
1659 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1660 ASSERT(page_idx >= 0);
1661 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
1662 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
1663
1664 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1665 erp_idx = 0;
1666 low = 0;
1667 high = nlists - 1;
1668
1669 /* Binary search extent irec's */
1670 while (low <= high) {
1671 erp_idx = (low + high) >> 1;
1672 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1673 prev = erp_idx > 0 ? erp - 1 : NULL;
1674 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
1675 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
1676 high = erp_idx - 1;
1677 } else if (page_idx > erp->er_extoff + erp->er_extcount ||
1678 (page_idx == erp->er_extoff + erp->er_extcount &&
1679 !realloc)) {
1680 low = erp_idx + 1;
1681 } else if (page_idx == erp->er_extoff + erp->er_extcount &&
1682 erp->er_extcount == XFS_LINEAR_EXTS) {
1683 ASSERT(realloc);
1684 page_idx = 0;
1685 erp_idx++;
1686 erp = erp_idx < nlists ? erp + 1 : NULL;
1687 break;
1688 } else {
1689 page_idx -= erp->er_extoff;
1690 break;
1691 }
1692 }
1693 *idxp = page_idx;
1694 *erp_idxp = erp_idx;
1695 return erp;
1696}
1697
1698/*
1699 * Allocate and initialize an indirection array once the space needed
1700 * for incore extents increases above XFS_IEXT_BUFSZ.
1701 */
1702void
1703xfs_iext_irec_init(
1704 xfs_ifork_t *ifp) /* inode fork pointer */
1705{
1706 xfs_ext_irec_t *erp; /* indirection array pointer */
1707 xfs_extnum_t nextents; /* number of extents in file */
1708
1709 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1710 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1711 ASSERT(nextents <= XFS_LINEAR_EXTS);
1712
1713 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
1714
1715 if (nextents == 0) {
1716 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
1717 } else if (!ifp->if_real_bytes) {
1718 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
1719 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
1720 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
1721 }
1722 erp->er_extbuf = ifp->if_u1.if_extents;
1723 erp->er_extcount = nextents;
1724 erp->er_extoff = 0;
1725
1726 ifp->if_flags |= XFS_IFEXTIREC;
1727 ifp->if_real_bytes = XFS_IEXT_BUFSZ;
1728 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
1729 ifp->if_u1.if_ext_irec = erp;
1730
1731 return;
1732}
1733
1734/*
1735 * Allocate and initialize a new entry in the indirection array.
1736 */
1737xfs_ext_irec_t *
1738xfs_iext_irec_new(
1739 xfs_ifork_t *ifp, /* inode fork pointer */
1740 int erp_idx) /* index for new irec */
1741{
1742 xfs_ext_irec_t *erp; /* indirection array pointer */
1743 int i; /* loop counter */
1744 int nlists; /* number of irec's (ex lists) */
1745
1746 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1747 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1748
1749 /* Resize indirection array */
1750 xfs_iext_realloc_indirect(ifp, ++nlists *
1751 sizeof(xfs_ext_irec_t));
1752 /*
1753 * Move records down in the array so the
1754 * new page can use erp_idx.
1755 */
1756 erp = ifp->if_u1.if_ext_irec;
1757 for (i = nlists - 1; i > erp_idx; i--) {
1758 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
1759 }
1760 ASSERT(i == erp_idx);
1761
1762 /* Initialize new extent record */
1763 erp = ifp->if_u1.if_ext_irec;
1764 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
1765 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
1766 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
1767 erp[erp_idx].er_extcount = 0;
1768 erp[erp_idx].er_extoff = erp_idx > 0 ?
1769 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
1770 return (&erp[erp_idx]);
1771}
1772
1773/*
1774 * Remove a record from the indirection array.
1775 */
1776void
1777xfs_iext_irec_remove(
1778 xfs_ifork_t *ifp, /* inode fork pointer */
1779 int erp_idx) /* irec index to remove */
1780{
1781 xfs_ext_irec_t *erp; /* indirection array pointer */
1782 int i; /* loop counter */
1783 int nlists; /* number of irec's (ex lists) */
1784
1785 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1786 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1787 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1788 if (erp->er_extbuf) {
1789 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
1790 -erp->er_extcount);
1791 kmem_free(erp->er_extbuf);
1792 }
1793 /* Compact extent records */
1794 erp = ifp->if_u1.if_ext_irec;
1795 for (i = erp_idx; i < nlists - 1; i++) {
1796 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
1797 }
1798 /*
1799 * Manually free the last extent record from the indirection
1800 * array. A call to xfs_iext_realloc_indirect() with a size
1801 * of zero would result in a call to xfs_iext_destroy() which
1802 * would in turn call this function again, creating a nasty
1803 * infinite loop.
1804 */
1805 if (--nlists) {
1806 xfs_iext_realloc_indirect(ifp,
1807 nlists * sizeof(xfs_ext_irec_t));
1808 } else {
1809 kmem_free(ifp->if_u1.if_ext_irec);
1810 }
1811 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
1812}
1813
1814/*
1815 * This is called to clean up large amounts of unused memory allocated
1816 * by the indirection array. Before compacting anything though, verify
1817 * that the indirection array is still needed and switch back to the
1818 * linear extent list (or even the inline buffer) if possible. The
1819 * compaction policy is as follows:
1820 *
1821 * Full Compaction: Extents fit into a single page (or inline buffer)
1822 * Partial Compaction: Extents occupy less than 50% of allocated space
1823 * No Compaction: Extents occupy at least 50% of allocated space
1824 */
1825void
1826xfs_iext_irec_compact(
1827 xfs_ifork_t *ifp) /* inode fork pointer */
1828{
1829 xfs_extnum_t nextents; /* number of extents in file */
1830 int nlists; /* number of irec's (ex lists) */
1831
1832 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1833 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1834 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1835
1836 if (nextents == 0) {
1837 xfs_iext_destroy(ifp);
1838 } else if (nextents <= XFS_INLINE_EXTS) {
1839 xfs_iext_indirect_to_direct(ifp);
1840 xfs_iext_direct_to_inline(ifp, nextents);
1841 } else if (nextents <= XFS_LINEAR_EXTS) {
1842 xfs_iext_indirect_to_direct(ifp);
1843 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
1844 xfs_iext_irec_compact_pages(ifp);
1845 }
1846}
1847
1848/*
1849 * Combine extents from neighboring extent pages.
1850 */
1851void
1852xfs_iext_irec_compact_pages(
1853 xfs_ifork_t *ifp) /* inode fork pointer */
1854{
1855 xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
1856 int erp_idx = 0; /* indirection array index */
1857 int nlists; /* number of irec's (ex lists) */
1858
1859 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1860 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1861 while (erp_idx < nlists - 1) {
1862 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1863 erp_next = erp + 1;
1864 if (erp_next->er_extcount <=
1865 (XFS_LINEAR_EXTS - erp->er_extcount)) {
1866 memcpy(&erp->er_extbuf[erp->er_extcount],
1867 erp_next->er_extbuf, erp_next->er_extcount *
1868 sizeof(xfs_bmbt_rec_t));
1869 erp->er_extcount += erp_next->er_extcount;
1870 /*
1871 * Free page before removing extent record
1872 * so er_extoffs don't get modified in
1873 * xfs_iext_irec_remove.
1874 */
1875 kmem_free(erp_next->er_extbuf);
1876 erp_next->er_extbuf = NULL;
1877 xfs_iext_irec_remove(ifp, erp_idx + 1);
1878 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1879 } else {
1880 erp_idx++;
1881 }
1882 }
1883}
1884
1885/*
1886 * This is called to update the er_extoff field in the indirection
1887 * array when extents have been added or removed from one of the
1888 * extent lists. erp_idx contains the irec index to begin updating
1889 * at and ext_diff contains the number of extents that were added
1890 * or removed.
1891 */
1892void
1893xfs_iext_irec_update_extoffs(
1894 xfs_ifork_t *ifp, /* inode fork pointer */
1895 int erp_idx, /* irec index to update */
1896 int ext_diff) /* number of new extents */
1897{
1898 int i; /* loop counter */
1899 int nlists; /* number of irec's (ex lists */
1900
1901 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1902 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1903 for (i = erp_idx; i < nlists; i++) {
1904 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
1905 }
1906}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
new file mode 100644
index 000000000000..7d3b1ed6dcbe
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INODE_FORK_H__
19#define __XFS_INODE_FORK_H__
20
21struct xfs_inode_log_item;
22struct xfs_dinode;
23
24/*
25 * The following xfs_ext_irec_t struct introduces a second (top) level
26 * to the in-core extent allocation scheme. These structs are allocated
27 * in a contiguous block, creating an indirection array where each entry
28 * (irec) contains a pointer to a buffer of in-core extent records which
29 * it manages. Each extent buffer is 4k in size, since 4k is the system
30 * page size on Linux i386 and systems with larger page sizes don't seem
31 * to gain much, if anything, by using their native page size as the
32 * extent buffer size. Also, using 4k extent buffers everywhere provides
33 * a consistent interface for CXFS across different platforms.
34 *
35 * There is currently no limit on the number of irec's (extent lists)
36 * allowed, so heavily fragmented files may require an indirection array
37 * which spans multiple system pages of memory. The number of extents
38 * which would require this amount of contiguous memory is very large
39 * and should not cause problems in the foreseeable future. However,
40 * if the memory needed for the contiguous array ever becomes a problem,
41 * it is possible that a third level of indirection may be required.
42 */
43typedef struct xfs_ext_irec {
44 xfs_bmbt_rec_host_t *er_extbuf; /* buffer of in-core extent records managed by this irec */
45 xfs_extnum_t er_extoff; /* extent offset in file; adjusted by xfs_iext_irec_update_extoffs() */
46 xfs_extnum_t er_extcount; /* number of extents currently held in er_extbuf */
47} xfs_ext_irec_t;
48
49/*
50 * File incore extent information, present for each of data & attr forks.
51 */
52#define XFS_IEXT_BUFSZ 4096
53#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
54#define XFS_INLINE_EXTS 2
55#define XFS_INLINE_DATA 32
56typedef struct xfs_ifork {
57 int if_bytes; /* bytes in if_u1 */
58 int if_real_bytes; /* bytes allocated in if_u1; with XFS_IFEXTIREC set, if_real_bytes / XFS_IEXT_BUFSZ is the number of irecs */
59 struct xfs_btree_block *if_broot; /* file's incore btree root */
60 short if_broot_bytes; /* bytes allocated for root */
61 unsigned char if_flags; /* per-fork flags (XFS_IF* values) */
62 union {
63 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
64 xfs_ext_irec_t *if_ext_irec; /* indirection array of irecs, used when XFS_IFEXTIREC is set */
65 char *if_data; /* inline file data */
66 } if_u1;
67 union {
68 xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
69 /* very small file extents, up to XFS_INLINE_EXTS records */
70 char if_inline_data[XFS_INLINE_DATA];
71 /* very small file data, up to XFS_INLINE_DATA bytes */
72 xfs_dev_t if_rdev; /* dev number if special */
73 uuid_t if_uuid; /* mount point value */
74 } if_u2;
75} xfs_ifork_t;
76
77/*
78 * Per-fork incore inode flags.
79 */
80#define XFS_IFINLINE 0x01 /* Inline data is read in */
81#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
82#define XFS_IFBROOT 0x04 /* if_broot points to the bmap b-tree root */
83#define XFS_IFEXTIREC 0x08 /* if_u1.if_ext_irec is an indirection array of extent blocks */
84
85/*
86 * Fork handling.
87 */
88
/* Non-zero when the inode core records a fork offset (di_forkoff != 0). */
#define XFS_IFORK_Q(ip)		((ip)->i_d.di_forkoff != 0)

/* Fork offset in bytes: di_forkoff shifted left by 3. */
#define XFS_IFORK_BOFF(ip)	((int)((ip)->i_d.di_forkoff << 3))

/* Incore fork for "w": the embedded i_df for the data fork, else i_afp. */
#define XFS_IFORK_PTR(ip,w)	\
	((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp)

/*
 * Data fork size: the fork offset when one is set, otherwise the whole
 * XFS_LITINO() area for this inode version.
 */
#define XFS_IFORK_DSIZE(ip)	\
	(XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : \
	 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))

/*
 * Attribute fork size: what is left of the XFS_LITINO() area past the
 * fork offset, or 0 when no fork offset is set.
 */
#define XFS_IFORK_ASIZE(ip)	\
	(XFS_IFORK_Q(ip) ? \
	 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
	 XFS_IFORK_BOFF(ip) : 0)

/* Size of fork "w", dispatching to the data or attribute variant. */
#define XFS_IFORK_SIZE(ip,w)	\
	((w) == XFS_DATA_FORK ? XFS_IFORK_DSIZE(ip) : XFS_IFORK_ASIZE(ip))

/* Read / assign the format field of fork "w" (di_format or di_aformat). */
#define XFS_IFORK_FORMAT(ip,w)	\
	((w) == XFS_DATA_FORK ? (ip)->i_d.di_format : (ip)->i_d.di_aformat)
#define XFS_IFORK_FMT_SET(ip,w,n)	\
	((w) == XFS_DATA_FORK ? \
	 ((ip)->i_d.di_format = (n)) : ((ip)->i_d.di_aformat = (n)))

/* Read / assign the extent count of fork "w" (di_nextents or di_anextents). */
#define XFS_IFORK_NEXTENTS(ip,w)	\
	((w) == XFS_DATA_FORK ? (ip)->i_d.di_nextents : (ip)->i_d.di_anextents)
#define XFS_IFORK_NEXT_SET(ip,w,n)	\
	((w) == XFS_DATA_FORK ? \
	 ((ip)->i_d.di_nextents = (n)) : ((ip)->i_d.di_anextents = (n)))

/* Number of xfs_bmbt_rec_t records that fit in fork "w". */
#define XFS_IFORK_MAXEXT(ip, w)	\
	(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
127
128int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
129void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
130 struct xfs_inode_log_item *, int);
131void xfs_idestroy_fork(struct xfs_inode *, int);
132void xfs_idata_realloc(struct xfs_inode *, int, int);
133void xfs_iroot_realloc(struct xfs_inode *, int, int);
134int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
135int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
136 int);
137
138struct xfs_bmbt_rec_host *
139 xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
140void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
141 struct xfs_bmbt_irec *, int);
142void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
143void xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
144 xfs_extnum_t, int);
145void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
146void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
147void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
148void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
149void xfs_iext_realloc_direct(struct xfs_ifork *, int);
150void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
151void xfs_iext_inline_to_direct(struct xfs_ifork *, int);
152void xfs_iext_destroy(struct xfs_ifork *);
153struct xfs_bmbt_rec_host *
154 xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
155struct xfs_ext_irec *
156 xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
157struct xfs_ext_irec *
158 xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
159 int);
160void xfs_iext_irec_init(struct xfs_ifork *);
161struct xfs_ext_irec *
162 xfs_iext_irec_new(struct xfs_ifork *, int);
163void xfs_iext_irec_remove(struct xfs_ifork *, int);
164void xfs_iext_irec_compact(struct xfs_ifork *);
165void xfs_iext_irec_compact_pages(struct xfs_ifork *);
166void xfs_iext_irec_compact_full(struct xfs_ifork *);
167void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
168
169extern struct kmem_zone *xfs_ifork_zone;
170
171#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
new file mode 100644
index 000000000000..90efdaf1706f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inum.h
@@ -0,0 +1,64 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INUM_H__
19#define __XFS_INUM_H__
20
21/*
22 * Inode number format:
23 * low inopblog bits - offset in block
24 * next agblklog bits - block number in ag
25 * next agno_log bits - ag number
26 * high agno_log-agblklog-inopblog bits - 0
27 */
28
29struct xfs_mount;
30
/* Mask covering the low k bits, truncated to 32 bits. */
#define XFS_INO_MASK(k)			((__uint32_t)((1ULL << (k)) - 1))

/* Widths, in bits, of the fields packed into an inode number. */
#define XFS_INO_OFFSET_BITS(mp)		((mp)->m_sb.sb_inopblog)
#define XFS_INO_AGBNO_BITS(mp)		((mp)->m_sb.sb_agblklog)
#define XFS_INO_AGINO_BITS(mp)		((mp)->m_agino_log)
#define XFS_INO_AGNO_BITS(mp)		((mp)->m_agno_log)

/*
 * Sum of the AG-number and AG-inode widths.  The expansion is fully
 * parenthesized so it keeps its value when used inside a larger
 * expression, e.g. 1 << XFS_INO_BITS(mp) or 2 * XFS_INO_BITS(mp).
 */
#define XFS_INO_BITS(mp)		\
	(XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp))

/* Extract the AG number: everything above the AG-inode bits. */
#define XFS_INO_TO_AGNO(mp,i)		\
	((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
/* Extract the AG-relative inode number: the low AG-inode bits. */
#define XFS_INO_TO_AGINO(mp,i)		\
	((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
/* Extract the AG block number: the bits just above the in-block offset. */
#define XFS_INO_TO_AGBNO(mp,i)		\
	(((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
		XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
/* Extract the inode's offset within its block: the lowest bits. */
#define XFS_INO_TO_OFFSET(mp,i)		\
	((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
/* Filesystem block holding the inode, built from its AG number and block. */
#define XFS_INO_TO_FSB(mp,i)		\
	XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
/* Combine AG number "a" and AG inode "i" into a full inode number. */
#define XFS_AGINO_TO_INO(mp,a,i)	\
	(((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
/* Split an AG inode number into AG block and in-block offset. */
#define XFS_AGINO_TO_AGBNO(mp,i)	((i) >> XFS_INO_OFFSET_BITS(mp))
#define XFS_AGINO_TO_OFFSET(mp,i)	\
	((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
/* Combine AG block "b" and in-block offset "o" into an AG inode number. */
#define XFS_OFFBNO_TO_AGINO(mp,b,o)	\
	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
56
57#if XFS_BIG_INUMS
58#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
59#else
60#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
61#endif
62#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
63
64#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
new file mode 100644
index 000000000000..f0969c77bdbe
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -0,0 +1,679 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LOG_FORMAT_H__
19#define __XFS_LOG_FORMAT_H__
20
21struct xfs_mount;
22struct xfs_trans_res;
23
24/*
25 * On-disk Log Format definitions.
26 *
27 * This file contains all the on-disk format definitions used within the log. It
28 * includes the physical log structure itself, as well as all the log item
29 * format structures that are written into the log and interpreted by log
30 * recovery. We start with the physical log format definitions, and then work
31 * through all the log items definitions and everything they encode into the
32 * log.
33 */
34typedef __uint32_t xlog_tid_t;
35
36#define XLOG_MIN_ICLOGS 2
37#define XLOG_MAX_ICLOGS 8
38#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
39#define XLOG_VERSION_1 1
40#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
41#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
42#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
43#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
44#define XLOG_MAX_RECORD_BSIZE (256*1024)
45#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
46#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
47#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
48#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
49#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
50 (log)->l_mp->m_sb.sb_logsunit)
51#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
52
53#define XLOG_HEADER_SIZE 512
54
55/* Minimum number of transactions that must fit in the log (defined by mkfs) */
56#define XFS_MIN_LOG_FACTOR 3
57
/*
 * BTOBB() of the largest log record size: 1 << XLOG_MAX_RECORD_BSHIFT when
 * the superblock reports a v2 log, 1 << XLOG_BIG_RECORD_BSHIFT otherwise.
 * The "log" argument is parenthesized so any pointer expression may be
 * passed, not just a plain identifier.
 */
#define XLOG_REC_SHIFT(log) \
	BTOBB(1 << (xfs_sb_version_haslogv2(&(log)->l_mp->m_sb) ? \
	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
/* Same as XLOG_REC_SHIFT(), scaled by XLOG_MAX_ICLOGS. */
#define XLOG_TOTAL_REC_SHIFT(log) \
	BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&(log)->l_mp->m_sb) ? \
	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
64
65/* get lsn fields */
66#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
67#define BLOCK_LSN(lsn) ((uint)(lsn))
68
69/* this is used in a spot where we might otherwise double-endian-flip */
70#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
71
72static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
73{
74 return ((xfs_lsn_t)cycle << 32) | block;
75}
76
77static inline uint xlog_get_cycle(char *ptr)
78{
79 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
80 return be32_to_cpu(*((__be32 *)ptr + 1));
81 else
82 return be32_to_cpu(*(__be32 *)ptr);
83}
84
85/* Log Clients */
86#define XFS_TRANSACTION 0x69
87#define XFS_VOLUME 0x2
88#define XFS_LOG 0xaa
89
90#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
91
92/* Region types for iovec's i_type */
93#define XLOG_REG_TYPE_BFORMAT 1
94#define XLOG_REG_TYPE_BCHUNK 2
95#define XLOG_REG_TYPE_EFI_FORMAT 3
96#define XLOG_REG_TYPE_EFD_FORMAT 4
97#define XLOG_REG_TYPE_IFORMAT 5
98#define XLOG_REG_TYPE_ICORE 6
99#define XLOG_REG_TYPE_IEXT 7
100#define XLOG_REG_TYPE_IBROOT 8
101#define XLOG_REG_TYPE_ILOCAL 9
102#define XLOG_REG_TYPE_IATTR_EXT 10
103#define XLOG_REG_TYPE_IATTR_BROOT 11
104#define XLOG_REG_TYPE_IATTR_LOCAL 12
105#define XLOG_REG_TYPE_QFORMAT 13
106#define XLOG_REG_TYPE_DQUOT 14
107#define XLOG_REG_TYPE_QUOTAOFF 15
108#define XLOG_REG_TYPE_LRHEADER 16
109#define XLOG_REG_TYPE_UNMOUNT 17
110#define XLOG_REG_TYPE_COMMIT 18
111#define XLOG_REG_TYPE_TRANSHDR 19
112#define XLOG_REG_TYPE_ICREATE 20
113#define XLOG_REG_TYPE_MAX 20
114
115/*
116 * Flags to log operation header
117 *
118 * The first write of a new transaction will be preceded with a start
119 * record, XLOG_START_TRANS. Once a transaction is committed, a commit
120 * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
121 * the remainder of the current active in-core log, it is split up into
122 * multiple regions. Each partial region will be marked with a
123 * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
124 *
125 */
126#define XLOG_START_TRANS 0x01 /* Start a new transaction */
127#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
128#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
129#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
130#define XLOG_END_TRANS 0x10 /* End a continued transaction */
131#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
132
133
134typedef struct xlog_op_header {
135 __be32 oh_tid; /* transaction id of operation : 4 b */
136 __be32 oh_len; /* bytes in data region : 4 b */
137 __u8 oh_clientid; /* who sent me this : 1 b */
138 __u8 oh_flags; /* : 1 b */
139 __u16 oh_res2; /* 32 bit align : 2 b */
140} xlog_op_header_t;
141
142/* valid values for h_fmt */
143#define XLOG_FMT_UNKNOWN 0
144#define XLOG_FMT_LINUX_LE 1
145#define XLOG_FMT_LINUX_BE 2
146#define XLOG_FMT_IRIX_BE 3
147
148/* our fmt */
149#ifdef XFS_NATIVE_HOST
150#define XLOG_FMT XLOG_FMT_LINUX_BE
151#else
152#define XLOG_FMT XLOG_FMT_LINUX_LE
153#endif
154
155typedef struct xlog_rec_header {
156 __be32 h_magicno; /* log record (LR) identifier : 4 */
157 __be32 h_cycle; /* write cycle of log : 4 */
158 __be32 h_version; /* LR version : 4 */
159 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
160 __be64 h_lsn; /* lsn of this LR : 8 */
161 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
162 __le32 h_crc; /* crc of log record : 4 */
163 __be32 h_prev_block; /* block number to previous LR : 4 */
164 __be32 h_num_logops; /* number of log operations in this LR : 4 */
165 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
166 /* new fields */
167 __be32 h_fmt; /* format of log record : 4 */
168 uuid_t h_fs_uuid; /* uuid of FS : 16 */
169 __be32 h_size; /* iclog size : 4 */
170} xlog_rec_header_t;
171
172typedef struct xlog_rec_ext_header {
173 __be32 xh_cycle; /* write cycle of log : 4 */
174 __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
175} xlog_rec_ext_header_t;
176
177/*
178 * Quite misnamed, because this union lays out the actual on-disk log buffer.
179 */
180typedef union xlog_in_core2 {
181 xlog_rec_header_t hic_header;
182 xlog_rec_ext_header_t hic_xheader;
183 char hic_sector[XLOG_HEADER_SIZE];
184} xlog_in_core_2_t;
185
186/* not an on-disk structure, but needed by log recovery in userspace */
187typedef struct xfs_log_iovec {
188 void *i_addr; /* beginning address of region */
189 int i_len; /* length in bytes of region */
190 uint i_type; /* type of region */
191} xfs_log_iovec_t;
192
193
194/*
195 * Transaction Header definitions.
196 *
197 * This is the structure written in the log at the head of every transaction. It
198 * identifies the type and id of the transaction, and contains the number of
199 * items logged by the transaction so we know how many to expect during
200 * recovery.
201 *
202 * Do not change the below structure without redoing the code in
203 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
204 */
205typedef struct xfs_trans_header {
206 uint th_magic; /* magic number */
207 uint th_type; /* transaction type */
208 __int32_t th_tid; /* transaction id (unused) */
209 uint th_num_items; /* num items logged by trans */
210} xfs_trans_header_t;
211
212#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
213
214/*
215 * Log item types.
216 */
217#define XFS_LI_EFI 0x1236
218#define XFS_LI_EFD 0x1237
219#define XFS_LI_IUNLINK 0x1238
220#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
221#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
222#define XFS_LI_DQUOT 0x123d
223#define XFS_LI_QUOTAOFF 0x123e
224#define XFS_LI_ICREATE 0x123f
225
226#define XFS_LI_TYPE_DESC \
227 { XFS_LI_EFI, "XFS_LI_EFI" }, \
228 { XFS_LI_EFD, "XFS_LI_EFD" }, \
229 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
230 { XFS_LI_INODE, "XFS_LI_INODE" }, \
231 { XFS_LI_BUF, "XFS_LI_BUF" }, \
232 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
233 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
234 { XFS_LI_ICREATE, "XFS_LI_ICREATE" }
235
236/*
237 * Inode Log Item Format definitions.
238 *
239 * This is the structure used to lay out an inode log item in the
240 * log. The size of the inline data/extents/b-tree root to be logged
241 * (if any) is indicated in the ilf_dsize field. Changes to this structure
242 * must be added on to the end.
243 */
244typedef struct xfs_inode_log_format {
245 __uint16_t ilf_type; /* inode log item type */
246 __uint16_t ilf_size; /* size of this item */
247 __uint32_t ilf_fields; /* flags for fields logged */
248 __uint16_t ilf_asize; /* size of attr d/ext/root */
249 __uint16_t ilf_dsize; /* size of data/ext/root */
250 __uint64_t ilf_ino; /* inode number */
251 union {
252 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
253 uuid_t ilfu_uuid; /* mount point value */
254 } ilf_u;
255 __int64_t ilf_blkno; /* blkno of inode buffer */
256 __int32_t ilf_len; /* len of inode buffer */
257 __int32_t ilf_boffset; /* off of inode in buffer */
258} xfs_inode_log_format_t;
259
260typedef struct xfs_inode_log_format_32 {
261 __uint16_t ilf_type; /* inode log item type */
262 __uint16_t ilf_size; /* size of this item */
263 __uint32_t ilf_fields; /* flags for fields logged */
264 __uint16_t ilf_asize; /* size of attr d/ext/root */
265 __uint16_t ilf_dsize; /* size of data/ext/root */
266 __uint64_t ilf_ino; /* inode number */
267 union {
268 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
269 uuid_t ilfu_uuid; /* mount point value */
270 } ilf_u;
271 __int64_t ilf_blkno; /* blkno of inode buffer */
272 __int32_t ilf_len; /* len of inode buffer */
273 __int32_t ilf_boffset; /* off of inode in buffer */
274} __attribute__((packed)) xfs_inode_log_format_32_t;
275
276typedef struct xfs_inode_log_format_64 {
277 __uint16_t ilf_type; /* inode log item type */
278 __uint16_t ilf_size; /* size of this item */
279 __uint32_t ilf_fields; /* flags for fields logged */
280 __uint16_t ilf_asize; /* size of attr d/ext/root */
281 __uint16_t ilf_dsize; /* size of data/ext/root */
282 __uint32_t ilf_pad; /* pad for 64 bit boundary */
283 __uint64_t ilf_ino; /* inode number */
284 union {
285 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
286 uuid_t ilfu_uuid; /* mount point value */
287 } ilf_u;
288 __int64_t ilf_blkno; /* blkno of inode buffer */
289 __int32_t ilf_len; /* len of inode buffer */
290 __int32_t ilf_boffset; /* off of inode in buffer */
291} xfs_inode_log_format_64_t;
292
293/*
294 * Flags for xfs_trans_log_inode flags field.
295 */
296#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
297#define XFS_ILOG_DDATA 0x002 /* log i_df.if_u1.if_data */
298#define XFS_ILOG_DEXT 0x004 /* log i_df.if_u1.if_extents */
299#define XFS_ILOG_DBROOT 0x008 /* log i_df.if_broot */
300#define XFS_ILOG_DEV 0x010 /* log the dev field */
301#define XFS_ILOG_UUID 0x020 /* log the uuid field */
302#define XFS_ILOG_ADATA 0x040 /* log i_afp->if_u1.if_data */
303#define XFS_ILOG_AEXT 0x080 /* log i_afp->if_u1.if_extents */
304#define XFS_ILOG_ABROOT 0x100 /* log i_afp->if_broot */
305#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
306#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
307
308
309/*
310 * The timestamps are dirty, but not necessarily anything else in the inode
311 * core. Unlike the other fields above this one must never make it to disk
312 * in the ilf_fields of the inode_log_format, but is purely stored in-memory in
313 * ili_fields in the inode_log_item.
314 */
315#define XFS_ILOG_TIMESTAMP 0x4000
316
317#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
318 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
319 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
320 XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
321 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
322
323#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
324 XFS_ILOG_DBROOT)
325
326#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
327 XFS_ILOG_ABROOT)
328
329#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
330 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
331 XFS_ILOG_DEV | XFS_ILOG_UUID | \
332 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
333 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
334 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
335
336static inline int xfs_ilog_fbroot(int w)
337{
338 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
339}
340
341static inline int xfs_ilog_fext(int w)
342{
343 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
344}
345
346static inline int xfs_ilog_fdata(int w)
347{
348 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
349}
350
351/*
352 * Incore version of the on-disk inode core structures. We log this directly
353 * into the journal in host CPU format (for better or worse) and as such
354 * directly mirrors the xfs_dinode structure as it must contain all the same
355 * information.
356 */
357typedef struct xfs_ictimestamp {
358 __int32_t t_sec; /* timestamp seconds */
359 __int32_t t_nsec; /* timestamp nanoseconds */
360} xfs_ictimestamp_t;
361
362/*
363 * NOTE: This structure must be kept identical to struct xfs_dinode
364 * in xfs_dinode.h except for the endianness annotations.
365 */
366typedef struct xfs_icdinode {
367 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
368 __uint16_t di_mode; /* mode and type of file */
369 __int8_t di_version; /* inode version */
370 __int8_t di_format; /* format of di_c data */
371 __uint16_t di_onlink; /* old number of links to file */
372 __uint32_t di_uid; /* owner's user id */
373 __uint32_t di_gid; /* owner's group id */
374 __uint32_t di_nlink; /* number of links to file */
375 __uint16_t di_projid_lo; /* lower part of owner's project id */
376 __uint16_t di_projid_hi; /* higher part of owner's project id */
377 __uint8_t di_pad[6]; /* unused, zeroed space */
378 __uint16_t di_flushiter; /* incremented on flush */
379 xfs_ictimestamp_t di_atime; /* time last accessed */
380 xfs_ictimestamp_t di_mtime; /* time last modified */
381 xfs_ictimestamp_t di_ctime; /* time created/inode modified */
382 xfs_fsize_t di_size; /* number of bytes in file */
383 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
384 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
385 xfs_extnum_t di_nextents; /* number of extents in data fork */
386 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
387 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
388 __int8_t di_aformat; /* format of attr fork's data */
389 __uint32_t di_dmevmask; /* DMIG event mask */
390 __uint16_t di_dmstate; /* DMIG state info */
391 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
392 __uint32_t di_gen; /* generation number */
393
394 /* di_next_unlinked is the only non-core field in the old dinode */
395 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
396
397 /* start of the extended dinode, writable fields */
398 __uint32_t di_crc; /* CRC of the inode */
399 __uint64_t di_changecount; /* number of attribute changes */
400 xfs_lsn_t di_lsn; /* flush sequence */
401 __uint64_t di_flags2; /* more random flags */
402 __uint8_t di_pad2[16]; /* more padding for future expansion */
403
404 /* fields only written to during inode creation */
405 xfs_ictimestamp_t di_crtime; /* time created */
406 xfs_ino_t di_ino; /* inode number */
407 uuid_t di_uuid; /* UUID of the filesystem */
408
409 /* structure must be padded to 64 bit alignment */
410} xfs_icdinode_t;
411
412static inline uint xfs_icdinode_size(int version)
413{
414 if (version == 3)
415 return sizeof(struct xfs_icdinode);
416 return offsetof(struct xfs_icdinode, di_next_unlinked);
417}
418
419/*
420 * Buffer Log Format definitions
421 *
422 * These are the physical dirty bitmap definitions for the log format structure.
423 */
424#define XFS_BLF_CHUNK 128
425#define XFS_BLF_SHIFT 7
426#define BIT_TO_WORD_SHIFT 5
427#define NBWORD (NBBY * sizeof(unsigned int))
428
429/*
430 * This flag indicates that the buffer contains on disk inodes
431 * and requires special recovery handling.
432 */
433#define XFS_BLF_INODE_BUF (1<<0)
434
435/*
436 * This flag indicates that the buffer should not be replayed
437 * during recovery because its blocks are being freed.
438 */
439#define XFS_BLF_CANCEL (1<<1)
440
441/*
442 * These flags indicate that the buffer contains on disk
443 * user, project or group dquots and may require special recovery handling.
444 */
445#define XFS_BLF_UDQUOT_BUF (1<<2)
446#define XFS_BLF_PDQUOT_BUF (1<<3)
447#define XFS_BLF_GDQUOT_BUF (1<<4)
448
449/*
450 * This is the structure used to lay out a buf log item in the
451 * log. The data map describes which 128 byte chunks of the buffer
452 * have been logged.
453 */
454#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
455
456typedef struct xfs_buf_log_format {
457 unsigned short blf_type; /* buf log item type indicator */
458 unsigned short blf_size; /* size of this item */
459 ushort blf_flags; /* misc state (presumably the XFS_BLF_* flags - confirm); upper XFS_BLFT_BITS bits hold the enum xfs_blft buffer type */
460 ushort blf_len; /* number of blocks in this buf */
461 __int64_t blf_blkno; /* starting blkno of this buf */
462 unsigned int blf_map_size; /* used size of data bitmap in words */
463 unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap, one bit per XFS_BLF_CHUNK bytes of the buffer */
464} xfs_buf_log_format_t;
465
466/*
467 * All buffers now need to tell recovery where the magic number
468 * is so that it can verify and calculate the CRCs on the buffer correctly
469 * once the changes have been replayed into the buffer.
470 *
471 * The type value is held in the upper 5 bits of the blf_flags field, which is
472 * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
473 */
474#define XFS_BLFT_BITS 5
475#define XFS_BLFT_SHIFT 11
476#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
477
478enum xfs_blft {
479 XFS_BLFT_UNKNOWN_BUF = 0,
480 XFS_BLFT_UDQUOT_BUF,
481 XFS_BLFT_PDQUOT_BUF,
482 XFS_BLFT_GDQUOT_BUF,
483 XFS_BLFT_BTREE_BUF,
484 XFS_BLFT_AGF_BUF,
485 XFS_BLFT_AGFL_BUF,
486 XFS_BLFT_AGI_BUF,
487 XFS_BLFT_DINO_BUF,
488 XFS_BLFT_SYMLINK_BUF,
489 XFS_BLFT_DIR_BLOCK_BUF,
490 XFS_BLFT_DIR_DATA_BUF,
491 XFS_BLFT_DIR_FREE_BUF,
492 XFS_BLFT_DIR_LEAF1_BUF,
493 XFS_BLFT_DIR_LEAFN_BUF,
494 XFS_BLFT_DA_NODE_BUF,
495 XFS_BLFT_ATTR_LEAF_BUF,
496 XFS_BLFT_ATTR_RMT_BUF,
497 XFS_BLFT_SB_BUF,
498 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
499};
500
501static inline void
502xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
503{
504 ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
505 blf->blf_flags &= ~XFS_BLFT_MASK;
506 blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
507}
508
509static inline __uint16_t
510xfs_blft_from_flags(struct xfs_buf_log_format *blf)
511{
512 return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
513}
514
515/*
516 * EFI/EFD log format definitions
517 */
518typedef struct xfs_extent {
519 xfs_dfsbno_t ext_start;
520 xfs_extlen_t ext_len;
521} xfs_extent_t;
522
523/*
524 * Since an xfs_extent_t has types (start:64, len: 32)
525 * there are different alignments on 32 bit and 64 bit kernels.
526 * So we provide the different variants for use by a
527 * conversion routine.
528 */
529typedef struct xfs_extent_32 {
530 __uint64_t ext_start;
531 __uint32_t ext_len;
532} __attribute__((packed)) xfs_extent_32_t;
533
534typedef struct xfs_extent_64 {
535 __uint64_t ext_start;
536 __uint32_t ext_len;
537 __uint32_t ext_pad;
538} xfs_extent_64_t;
539
540/*
541 * This is the structure used to lay out an efi log item in the
542 * log. The efi_extents field is a variable size array whose
543 * size is given by efi_nextents.
544 */
545typedef struct xfs_efi_log_format {
546 __uint16_t efi_type; /* efi log item type */
547 __uint16_t efi_size; /* size of this item */
548 __uint32_t efi_nextents; /* # extents to free */
549 __uint64_t efi_id; /* efi identifier */
550 xfs_extent_t efi_extents[1]; /* array of extents to free */
551} xfs_efi_log_format_t;
552
553typedef struct xfs_efi_log_format_32 {
554 __uint16_t efi_type; /* efi log item type */
555 __uint16_t efi_size; /* size of this item */
556 __uint32_t efi_nextents; /* # extents to free */
557 __uint64_t efi_id; /* efi identifier */
558 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
559} __attribute__((packed)) xfs_efi_log_format_32_t;
560
561typedef struct xfs_efi_log_format_64 {
562 __uint16_t efi_type; /* efi log item type */
563 __uint16_t efi_size; /* size of this item */
564 __uint32_t efi_nextents; /* # extents to free */
565 __uint64_t efi_id; /* efi identifier */
566 xfs_extent_64_t efi_extents[1]; /* array of extents to free */
567} xfs_efi_log_format_64_t;
568
569/*
570 * This is the structure used to lay out an efd log item in the
571 * log. The efd_extents array is a variable size array whose
572 * size is given by efd_nextents;
573 */
574typedef struct xfs_efd_log_format {
575 __uint16_t efd_type; /* efd log item type */
576 __uint16_t efd_size; /* size of this item */
577 __uint32_t efd_nextents; /* # of extents freed */
578 __uint64_t efd_efi_id; /* id of corresponding efi */
579 xfs_extent_t efd_extents[1]; /* array of extents freed */
580} xfs_efd_log_format_t;
581
/*
 * Packed variant of the efd log format: same leading fields, but the
 * extent array is made of xfs_extent_32_t elements.
 */
582typedef struct xfs_efd_log_format_32 {
583 __uint16_t efd_type; /* efd log item type */
584 __uint16_t efd_size; /* size of this item */
585 __uint32_t efd_nextents; /* # of extents freed */
586 __uint64_t efd_efi_id; /* id of corresponding efi */
587 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
588} __attribute__((packed)) xfs_efd_log_format_32_t;
589
/*
 * Variant of the efd log format whose extent array is made of
 * xfs_extent_64_t elements; unlike the _32 variant it is not packed.
 */
590typedef struct xfs_efd_log_format_64 {
591 __uint16_t efd_type; /* efd log item type */
592 __uint16_t efd_size; /* size of this item */
593 __uint32_t efd_nextents; /* # of extents freed */
594 __uint64_t efd_efi_id; /* id of corresponding efi */
595 xfs_extent_64_t efd_extents[1]; /* array of extents freed */
596} xfs_efd_log_format_64_t;
597
598/*
599 * Dquot Log format definitions.
600 *
601 * The first two fields must be the type and size, together fitting into
602 * 32 bits: log recovery code assumes that.
603 */
604typedef struct xfs_dq_logformat {
605 __uint16_t qlf_type; /* dquot log item type */
606 __uint16_t qlf_size; /* size of this item */
607 xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
608 __int64_t qlf_blkno; /* blkno of dquot buffer */
609 __int32_t qlf_len; /* len of dquot buffer */
610 __uint32_t qlf_boffset; /* offset of dquot in buffer */
611} xfs_dq_logformat_t;
612
613/*
614 * log format struct for QUOTAOFF records.
615 * The first two fields must be the type and size fitting into
616 * 32 bits : log_recovery code assumes that.
617 * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
618 * to the first and ensures that the first logitem is taken out of the AIL
619 * only when the last one is securely committed.
 *
 * NOTE(review): unlike the other log formats in this header, the fields
 * here use plain unsigned short/int rather than fixed-width types, so the
 * "32 bits" requirement relies on unsigned short being 16 bits - confirm.
620 */
621typedef struct xfs_qoff_logformat {
622 unsigned short qf_type; /* quotaoff log item type */
623 unsigned short qf_size; /* size of this item */
624 unsigned int qf_flags; /* USR and/or GRP */
625 char qf_pad[12]; /* padding for future */
626} xfs_qoff_logformat_t;
627
628/*
629 * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
630 */
631#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
632#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
633#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
634#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
635#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
636#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
637#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
638
639/*
640 * Conversion to and from the combined OQUOTA flag (if necessary)
641 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk().
642 */
643#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
644#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
645#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
646#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
647
/*
 * Per-category unions over the user, group and project flags. None of
 * them includes the combined XFS_OQUOTA_* bits.
 */
648#define XFS_ALL_QUOTA_ACCT \
649 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
650#define XFS_ALL_QUOTA_ENFD \
651 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
652#define XFS_ALL_QUOTA_CHKD \
653 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
654
/* Every ACCT, ENFD and CHKD bit for user, group and project quotas. */
655#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
656 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
657 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
658 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
659 XFS_PQUOTA_CHKD)
660
661/*
662 * Inode create log item structure
663 *
664 * Log recovery assumes the first two entries are the type and size and they fit
665 * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
666 * decoding can be done correctly.
 *
 * icl_type and icl_size are declared as host-order __uint16_t; every other
 * field is declared big-endian (__be32).
667 */
668struct xfs_icreate_log {
669 __uint16_t icl_type; /* type of log format structure */
670 __uint16_t icl_size; /* size of log format structure */
671 __be32 icl_ag; /* ag being allocated in */
672 __be32 icl_agbno; /* start block of inode range */
673 __be32 icl_count; /* number of inodes to initialise */
674 __be32 icl_isize; /* size of inodes */
675 __be32 icl_length; /* length of extent to initialise */
676 __be32 icl_gen; /* inode generation number to use */
677};
678
679#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
new file mode 100644
index 000000000000..1c55ccbb379d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LOG_RECOVER_H__
19#define __XFS_LOG_RECOVER_H__
20
21/*
22 * Macros, structures, prototypes for internal log manager use.
23 */
24
/*
 * XLOG_RHASH() maps a transaction id to a bucket index: the id is cast to
 * 32 bits, shifted right by XLOG_RHASH_SHIFT and masked with
 * XLOG_RHASH_SIZE - 1, giving a value in [0, XLOG_RHASH_SIZE).
 */
25#define XLOG_RHASH_BITS 4
26#define XLOG_RHASH_SIZE 16
27#define XLOG_RHASH_SHIFT 2
28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30
/*
 * Computed as XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1. Going by the name,
 * a cap on the number of regions in one item - TODO confirm against users.
 */
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32
33
34/*
35 * item headers are in ri_buf[0]. Additional buffers follow.
 *
 * ri_buf points at the item's array of region descriptors; ri_cnt and
 * ri_total count the regions found so far and the regions expected.
36 */
37typedef struct xlog_recover_item {
38 struct list_head ri_list; /* list linkage for this item */
39 int ri_type;
40 int ri_cnt; /* count of regions found */
41 int ri_total; /* total regions */
42 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
43} xlog_recover_item_t;
44
/*
 * Per-transaction recovery state: hash-list linkage, the log transaction id,
 * a copy of the transaction header, the transaction's lsn and a queue of
 * items.
 *
 * NOTE(review): only "struct xlog_tid" is forward-declared here, while the
 * r_log_tid member uses the xlog_tid_t typedef, which must come from another
 * header - confirm the include order this file relies on.
 */
45struct xlog_tid;
46typedef struct xlog_recover {
47 struct hlist_node r_list;
48 xlog_tid_t r_log_tid; /* log's transaction id */
49 xfs_trans_header_t r_theader; /* trans header for partial */
50 int r_state; /* not needed */
51 xfs_lsn_t r_lsn; /* xact lsn */
52 struct list_head r_itemq; /* q for items */
53} xlog_recover_t;
54
/*
 * Read an item's type: the first ushort stored at ri_buf[0].i_addr of the
 * given recover item.
 */
55#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
56
57/*
58 * This is the number of entries in the l_buf_cancel_table used during
59 * log recovery.
60 */
61#define XLOG_BC_TABLE_SIZE 64
62
/* Pass identifiers; presumably select the first or second recovery pass - confirm at the call sites. */
63#define XLOG_RECOVER_PASS1 1
64#define XLOG_RECOVER_PASS2 2
65
66#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
new file mode 100644
index 000000000000..ee7e0e80246b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -0,0 +1,150 @@
1/*
2 * Copyright (c) 2013 Jie Liu.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_ag.h"
25#include "xfs_sb.h"
26#include "xfs_mount.h"
27#include "xfs_da_format.h"
28#include "xfs_trans_space.h"
29#include "xfs_inode.h"
30#include "xfs_da_btree.h"
31#include "xfs_attr_leaf.h"
32#include "xfs_bmap_btree.h"
33
34/*
35 * Calculate the maximum length in bytes that would be required for a local
36 * attribute value as large attributes out of line are not logged.
 *
 * That length is converted into a block count (nblks), and the function
 * returns tr_attrsetm.tr_logres plus tr_attrsetrt.tr_logres for each of
 * those blocks, both taken from M_RES(mp).
37 */
38STATIC int
39xfs_log_calc_max_attrsetm_res(
40 struct xfs_mount *mp)
41{
42 int size;
43 int nblks;
44
 /* Largest local entry for an attr block, less MAXNAMELEN and one byte. */
45 size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
46 MAXNAMELEN - 1;
 /* Sum three block counts: DAENTER space, the value itself, extent adds. */
47 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
48 nblks += XFS_B_TO_FSB(mp, size);
49 nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
50
51 return M_RES(mp)->tr_attrsetm.tr_logres +
52 M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
53}
54
55/*
56 * Iterate over the log space reservation table to figure out and return
57 * the maximum one in terms of the pre-calculated values which were done
58 * at mount time.
 *
 * The winning entry is copied into *max_resp. If no entry has a weight
 * above zero and the attr reservation is not above zero either, *max_resp
 * is left as the caller passed it in.
59 */
60STATIC void
61xfs_log_get_max_trans_res(
62 struct xfs_mount *mp,
63 struct xfs_trans_res *max_resp)
64{
65 struct xfs_trans_res *resp;
66 struct xfs_trans_res *end_resp;
67 int log_space = 0;
68 int attr_space;
69
70 attr_space = xfs_log_calc_max_attrsetm_res(mp);
71
 /*
 * Walk the object M_RES(mp) points at as an array of struct
 * xfs_trans_res; M_RES(mp) + 1 is one past the end of that object.
 */
72 resp = (struct xfs_trans_res *)M_RES(mp);
73 end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
74 for (; resp < end_resp; resp++) {
 /* Weight is tr_logres, multiplied by tr_logcount when that exceeds 1. */
75 int tmp = resp->tr_logcount > 1 ?
76 resp->tr_logres * resp->tr_logcount :
77 resp->tr_logres;
78 if (log_space < tmp) {
79 log_space = tmp;
80 *max_resp = *resp; /* struct copy */
81 }
82 }
83
 /*
 * The attr set reservation computed above wins if it is larger: report
 * tr_attrsetm with its tr_logres replaced by that value.
 */
84 if (attr_space > log_space) {
85 *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
86 max_resp->tr_logres = attr_space;
87 }
88}
89
90/*
91 * Calculate the minimum valid log size for the given superblock configuration.
92 * Used to calculate the minimum log size at mkfs time, and to determine if
93 * the log is large enough or not at mount time. Returns the minimum size in
94 * filesystem block size units.
95 */
96int
97xfs_log_calc_minimum_size(
98 struct xfs_mount *mp)
99{
100 struct xfs_trans_res tres = {0};
101 int max_logres;
102 int min_logblks = 0;
103 int lsunit = 0;
104
105 xfs_log_get_max_trans_res(mp, &tres);
106
 /* Unit reservation for the largest transaction, times its log count. */
107 max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
108 if (tres.tr_logcount > 1)
109 max_logres *= tres.tr_logcount;
110
 /* lsunit stays 0 unless this is a v2 log with a stripe unit above 1. */
111 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
112 lsunit = BTOBB(mp->m_sb.sb_logsunit);
113
114 /*
115 * Two factors should be taken into account for calculating the minimum
116 * log space.
117 * 1) The fundamental limitation is that no single transaction can be
118 * larger than half size of the log.
119 *
120 * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
121 * define, which is set to 3. That means we can definitely fit
122 * 2 maximally sized transactions in the log. We'll use this same
123 * value here.
124 *
125 * 2) If the lsunit option is specified, a transaction requires 2 LSU
126 * for the reservation because there are two log writes that can
127 * require padding - the transaction data and the commit record which
128 * are written separately and both can require padding to the LSU.
129 * Consider that we can have an active CIL reservation holding 2*LSU,
130 * but the CIL is not over a push threshold, in this case, if we
131 * don't have enough log space for at least one new transaction, which
132 * includes another 2*LSU in the reservation, we will run into a dead
133 * loop situation in the log space grant procedure, i.e.
134 * xlog_grant_head_wait().
135 *
136 * Hence the log size needs to be able to contain two maximally sized
137 * and padded transactions, which is (2 * (2 * LSU + maxlres)).
138 *
139 * Also, the log size should be a multiple of the log stripe unit, round
140 * it up to lsunit boundary if lsunit is specified.
 *
 * NOTE(review): in the branch without a stripe unit, 2 * BBSIZE is added
 * to a BTOBB() result; BBSIZE is presumably a byte count rather than a
 * number of basic blocks - confirm the intended units before touching
 * this, since the result gates mkfs and mount.
141 */
142 if (lsunit) {
143 min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
144 2 * lsunit;
145 } else
146 min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
147 min_logblks *= XFS_MIN_LOG_FACTOR;
148
149 return XFS_BB_TO_FSB(mp, min_logblks);
150}
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
new file mode 100644
index 000000000000..137e20937077
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -0,0 +1,161 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_QUOTA_DEFS_H__
19#define __XFS_QUOTA_DEFS_H__
20
21/*
22 * Quota definitions shared between user and kernel source trees.
23 */
24
25/*
26 * Even though users may not have quota limits occupying all 64-bits,
27 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
28 * and quota-limits. This is a waste in the common case, but hey ...
29 */
30typedef __uint64_t xfs_qcnt_t; /* 64-bit quota counter / limit */
31typedef __uint16_t xfs_qwarncnt_t; /* 16-bit count; going by the name, of quota warnings */
32
33/*
34 * flags for q_flags field in the dquot.
35 */
36#define XFS_DQ_USER 0x0001 /* a user quota */
37#define XFS_DQ_PROJ 0x0002 /* project quota */
38#define XFS_DQ_GROUP 0x0004 /* a group quota */
39#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
40#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
41
/* Mask of the three quota type bits. */
42#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
43
/* Initializer list pairing each flag above with a printable name. */
44#define XFS_DQ_FLAGS \
45 { XFS_DQ_USER, "USER" }, \
46 { XFS_DQ_PROJ, "PROJ" }, \
47 { XFS_DQ_GROUP, "GROUP" }, \
48 { XFS_DQ_DIRTY, "DIRTY" }, \
49 { XFS_DQ_FREEING, "FREEING" }
50
51/*
52 * We have the possibility of all three quota types being active at once, and
53 * hence free space modification requires modification of all three current
54 * dquots in a single transaction. For this case we need to have a reservation
55 * of at least 3 dquots.
56 *
57 * However, a chown operation can change both UID and GID in a single
58 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
59 * modified. Hence for this case we need to reserve space for at least 4 dquots.
60 *
61 * And in the worst case, there's a rename operation that can be modifying up to
62 * 4 inodes with dquots attached to them. In reality, the only inodes that can
63 * have their dquots modified are the source and destination directory inodes
64 * due to directory name creation and removal. That can require space allocation
65 * and/or freeing on both directory inodes, and hence all three dquots on each
66 * inode can be modified. And if the directories are world writeable, all the
67 * dquots can be unique and so 6 dquots can be modified....
68 *
69 * And, of course, we also need to take into account the dquot log format item
70 * used to describe each dquot.
 *
 * The result is therefore 6 * (log format item + on-disk dquot). The mp
 * argument is not used by the expansion.
71 */
72#define XFS_DQUOT_LOGRES(mp) \
73 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
74
/*
 * Tests on (mp)->m_qflags for the accounting (RUNNING) and enforcement
 * (ENFORCED) bits. Each expands to the masked flag value, so the result is
 * zero or nonzero rather than a normalized 0/1.
 */
75#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
76#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
77#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
78#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
79#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
80#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
81#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
82
83/*
84 * Incore only flags for quotaoff - these bits get cleared when quota(s)
85 * are in the process of getting turned off. These flags are in m_qflags but
86 * never in sb_qflags.
 *
 * NOTE(review): the per-flag comments below read as if a set bit means
 * "being turned off", whereas the text above says the bit is cleared in
 * that case - confirm which is meant and reword the per-flag comments.
87 */
88#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
89#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
90#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
91#define XFS_ALL_QUOTA_ACTIVE \
92 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
93
94/*
95 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
96 * quota will not be switched off as long as that inode lock is held.
 *
 * Each macro tests the matching XFS_*QUOTA_ACTIVE bit(s) in (mp)->m_qflags
 * and expands to the masked value (zero or nonzero).
97 */
98#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
99 XFS_GQUOTA_ACTIVE | \
100 XFS_PQUOTA_ACTIVE))
101#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
102 XFS_PQUOTA_ACTIVE))
103#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
104#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
105#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
106
107/*
108 * Flags to tell various functions what to do. Not all of these are meaningful
109 * to a single function. None of these XFS_QMOPT_* flags are meant to have
110 * persistent values (i.e. their values can and will change between versions).
111 */
112#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
113#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
114#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
115#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
116#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
117#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
118#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
119#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
120#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
121
122/*
123 * flags to xfs_trans_mod_dquot to indicate which field needs to be
124 * modified. They occupy bits 0x0010000 through 0x0800000, above the
 * general XFS_QMOPT_* flags.
125 */
126#define XFS_QMOPT_RES_REGBLKS 0x0010000
127#define XFS_QMOPT_RES_RTBLKS 0x0020000
128#define XFS_QMOPT_BCOUNT 0x0040000
129#define XFS_QMOPT_ICOUNT 0x0080000
130#define XFS_QMOPT_RTBCOUNT 0x0100000
131#define XFS_QMOPT_DELBCOUNT 0x0200000
132#define XFS_QMOPT_DELRTBCOUNT 0x0400000
133#define XFS_QMOPT_RES_INOS 0x0800000
134
135/*
136 * flags for dqalloc. A single bit, above all other XFS_QMOPT_* values.
137 */
138#define XFS_QMOPT_INHERIT 0x1000000
139
140/*
141 * flags to xfs_trans_mod_dquot. Each is an alias for the XFS_QMOPT_* flag
 * on the same line; no new bit values are introduced here.
142 */
143#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
144#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
145#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
146#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
147#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
148#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
149#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
150#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
151
152
/* Mask of the three "dquot requested" flags (user, project, group). */
153#define XFS_QMOPT_QUOTALL \
154 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
/* Mask of the two block-reservation flags (regular and realtime). */
155#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
156
/* Prototypes only; neither function is defined in this header. */
157extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
158 xfs_dqid_t id, uint type, uint flags, char *str);
159extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
160
161#endif /* __XFS_QUOTA_DEFS_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
new file mode 100644
index 000000000000..f4dd697cac08
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -0,0 +1,973 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_inode.h"
29#include "xfs_bmap.h"
30#include "xfs_bmap_util.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc.h"
33#include "xfs_error.h"
34#include "xfs_trans.h"
35#include "xfs_trans_space.h"
36#include "xfs_trace.h"
37#include "xfs_buf.h"
38#include "xfs_icache.h"
39#include "xfs_dinode.h"
40#include "xfs_rtalloc.h"
41
42
43/*
44 * Realtime allocator bitmap functions shared with userspace.
45 */
46
47/*
48 * Get a buffer for the bitmap or summary file block specified.
49 * The buffer is returned read and locked.
 *
 * The block is looked up in the data fork of mp->m_rsumip when issum is
 * nonzero, otherwise of mp->m_rbmip, and then read through the transaction
 * from mp->m_ddev_targp. Returns 0 with *bpp set, or the error from
 * xfs_bmapi_read() / xfs_trans_read_buf(), in which case *bpp is not written.
 *
 * NOTE(review): nmap is never inspected after xfs_bmapi_read(), and the only
 * check on map.br_startblock is an ASSERT, presumably compiled out of
 * non-debug builds - confirm whether a hole here needs a runtime error.
50 */
51int
52xfs_rtbuf_get(
53 xfs_mount_t *mp, /* file system mount structure */
54 xfs_trans_t *tp, /* transaction pointer */
55 xfs_rtblock_t block, /* block number in bitmap or summary */
56 int issum, /* is summary not bitmap */
57 xfs_buf_t **bpp) /* output: buffer for the block */
58{
59 xfs_buf_t *bp; /* block buffer, result */
60 xfs_inode_t *ip; /* bitmap or summary inode */
61 xfs_bmbt_irec_t map; /* mapping of the requested file block */
62 int nmap = 1; /* ask for a single mapping */
63 int error; /* error value */
64
65 ip = issum ? mp->m_rsumip : mp->m_rbmip;
66
67 error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
68 if (error)
69 return error;
70
71 ASSERT(map.br_startblock != NULLFSBLOCK);
72 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
73 XFS_FSB_TO_DADDR(mp, map.br_startblock),
74 mp->m_bsize, 0, &bp, NULL);
75 if (error)
76 return error;
77 *bpp = bp;
78 return 0;
79}
80
81/*
82 * Searching backward from start to limit, find the first block whose
83 * allocated/free state is different from start's.
 *
 * i counts the bits, from start downward, that match start's state. On
 * success 0 is returned and *rtblock is set to start - i + 1, i.e. the
 * lowest block of the matching run (start - len + 1 == limit when nothing
 * differs). An error from xfs_rtbuf_get() is returned as is, with
 * *rtblock untouched. The bitmap buffer is released with
 * xfs_trans_brelse() before every return that still holds one.
84 */
85int
86xfs_rtfind_back(
87 xfs_mount_t *mp, /* file system mount point */
88 xfs_trans_t *tp, /* transaction pointer */
89 xfs_rtblock_t start, /* starting block to look at */
90 xfs_rtblock_t limit, /* last block to look at */
91 xfs_rtblock_t *rtblock) /* out: start block found */
92{
93 xfs_rtword_t *b; /* current word in buffer */
94 int bit; /* bit number in the word */
95 xfs_rtblock_t block; /* bitmap block number */
96 xfs_buf_t *bp; /* buf for the block */
97 xfs_rtword_t *bufp; /* starting word in buffer */
98 int error; /* error value */
99 xfs_rtblock_t firstbit; /* first useful bit in the word */
100 xfs_rtblock_t i; /* current bit number rel. to start */
101 xfs_rtblock_t len; /* length of inspected area */
102 xfs_rtword_t mask; /* mask of relevant bits for value */
103 xfs_rtword_t want; /* mask for "good" values */
104 xfs_rtword_t wdiff; /* difference from wanted value */
105 int word; /* word number in the buffer */
106
107 /*
108 * Compute and read in starting bitmap block for starting block.
109 */
110 block = XFS_BITTOBLOCK(mp, start);
111 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
112 if (error) {
113 return error;
114 }
115 bufp = bp->b_addr;
116 /*
117 * Get the first word's index & point to it.
118 */
119 word = XFS_BITTOWORD(mp, start);
120 b = &bufp[word];
121 bit = (int)(start & (XFS_NBWORD - 1));
122 len = start - limit + 1;
123 /*
124 * Compute match value, based on the bit at start: if 1 (free)
125 * then all-ones, else all-zeroes.
126 */
127 want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
128 /*
129 * If the starting position is not word-aligned, deal with the
130 * partial word. For the backward scan "aligned" means start is the
 * highest bit of its word (bit == XFS_NBWORD - 1).
131 */
132 if (bit < XFS_NBWORD - 1) {
133 /*
134 * Calculate first (leftmost) bit number to look at,
135 * and mask for all the relevant bits in this word.
136 */
137 firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
138 mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
139 firstbit;
140 /*
141 * Calculate the difference between the value there
142 * and what we're looking for.
143 */
144 if ((wdiff = (*b ^ want) & mask)) {
145 /*
146 * Different. Mark where we are and return.
147 */
148 xfs_trans_brelse(tp, bp);
149 i = bit - XFS_RTHIBIT(wdiff);
150 *rtblock = start - i + 1;
151 return 0;
152 }
153 i = bit - firstbit + 1;
154 /*
155 * Go on to previous block if that's where the previous word is
156 * and we need the previous word.
157 */
158 if (--word == -1 && i < len) {
159 /*
160 * If done with this block, get the previous one.
161 */
162 xfs_trans_brelse(tp, bp);
163 error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
164 if (error) {
165 return error;
166 }
167 bufp = bp->b_addr;
168 word = XFS_BLOCKWMASK(mp);
169 b = &bufp[word];
170 } else {
171 /*
172 * Go on to the previous word in the buffer.
173 */
174 b--;
175 }
176 } else {
177 /*
178 * Starting on a word boundary, no partial word.
179 */
180 i = 0;
181 }
182 /*
183 * Loop over whole words in buffers. When we use up one buffer
184 * we move on to the previous one.
185 */
186 while (len - i >= XFS_NBWORD) {
187 /*
188 * Compute difference between actual and desired value.
189 */
190 if ((wdiff = *b ^ want)) {
191 /*
192 * Different, mark where we are and return.
193 */
194 xfs_trans_brelse(tp, bp);
195 i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
196 *rtblock = start - i + 1;
197 return 0;
198 }
199 i += XFS_NBWORD;
200 /*
201 * Go on to previous block if that's where the previous word is
202 * and we need the previous word.
203 */
204 if (--word == -1 && i < len) {
205 /*
206 * If done with this block, get the previous one.
207 */
208 xfs_trans_brelse(tp, bp);
209 error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
210 if (error) {
211 return error;
212 }
213 bufp = bp->b_addr;
214 word = XFS_BLOCKWMASK(mp);
215 b = &bufp[word];
216 } else {
217 /*
218 * Go on to the previous word in the buffer.
219 */
220 b--;
221 }
222 }
223 /*
224 * If not ending on a word boundary, deal with the last
225 * (partial) word.
226 */
227 if (len - i) {
228 /*
229 * Calculate first (leftmost) bit number to look at,
230 * and mask for all the relevant bits in this word.
231 */
232 firstbit = XFS_NBWORD - (len - i);
233 mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
234 /*
235 * Compute difference between actual and desired value.
236 */
237 if ((wdiff = (*b ^ want) & mask)) {
238 /*
239 * Different, mark where we are and return.
240 */
241 xfs_trans_brelse(tp, bp);
242 i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
243 *rtblock = start - i + 1;
244 return 0;
245 } else
246 i = len;
247 }
248 /*
249 * No match, return that we scanned the whole area.
250 */
251 xfs_trans_brelse(tp, bp);
252 *rtblock = start - i + 1;
253 return 0;
254}
255
256/*
257 * Searching forward from start to limit, find the first block whose
258 * allocated/free state is different from start's.
 *
 * i counts the bits, from start upward, that match start's state. On
 * success 0 is returned and *rtblock is set to start + i - 1, i.e. the
 * highest block of the matching run (start + len - 1 == limit when nothing
 * differs). An error from xfs_rtbuf_get() is returned as is, with
 * *rtblock untouched. The bitmap buffer is released with
 * xfs_trans_brelse() before every return that still holds one.
259 */
260int
261xfs_rtfind_forw(
262 xfs_mount_t *mp, /* file system mount point */
263 xfs_trans_t *tp, /* transaction pointer */
264 xfs_rtblock_t start, /* starting block to look at */
265 xfs_rtblock_t limit, /* last block to look at */
266 xfs_rtblock_t *rtblock) /* out: start block found */
267{
268 xfs_rtword_t *b; /* current word in buffer */
269 int bit; /* bit number in the word */
270 xfs_rtblock_t block; /* bitmap block number */
271 xfs_buf_t *bp; /* buf for the block */
272 xfs_rtword_t *bufp; /* starting word in buffer */
273 int error; /* error value */
274 xfs_rtblock_t i; /* current bit number rel. to start */
275 xfs_rtblock_t lastbit; /* last useful bit in the word */
276 xfs_rtblock_t len; /* length of inspected area */
277 xfs_rtword_t mask; /* mask of relevant bits for value */
278 xfs_rtword_t want; /* mask for "good" values */
279 xfs_rtword_t wdiff; /* difference from wanted value */
280 int word; /* word number in the buffer */
281
282 /*
283 * Compute and read in starting bitmap block for starting block.
284 */
285 block = XFS_BITTOBLOCK(mp, start);
286 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
287 if (error) {
288 return error;
289 }
290 bufp = bp->b_addr;
291 /*
292 * Get the first word's index & point to it.
293 */
294 word = XFS_BITTOWORD(mp, start);
295 b = &bufp[word];
296 bit = (int)(start & (XFS_NBWORD - 1));
297 len = limit - start + 1;
298 /*
299 * Compute match value, based on the bit at start: if 1 (free)
300 * then all-ones, else all-zeroes.
301 */
302 want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
303 /*
304 * If the starting position is not word-aligned, deal with the
305 * partial word.
306 */
307 if (bit) {
308 /*
309 * Calculate last (rightmost) bit number to look at,
310 * and mask for all the relevant bits in this word.
311 */
312 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
313 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
314 /*
315 * Calculate the difference between the value there
316 * and what we're looking for.
317 */
318 if ((wdiff = (*b ^ want) & mask)) {
319 /*
320 * Different. Mark where we are and return.
321 */
322 xfs_trans_brelse(tp, bp);
323 i = XFS_RTLOBIT(wdiff) - bit;
324 *rtblock = start + i - 1;
325 return 0;
326 }
327 i = lastbit - bit;
328 /*
329 * Go on to next block if that's where the next word is
330 * and we need the next word.
331 */
332 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
333 /*
334 * If done with this block, get the next one.
335 */
336 xfs_trans_brelse(tp, bp);
337 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
338 if (error) {
339 return error;
340 }
341 b = bufp = bp->b_addr;
342 word = 0;
343 } else {
344 /*
345 * Go on to the next word in the buffer.
346 */
347 b++;
348 }
349 } else {
350 /*
351 * Starting on a word boundary, no partial word.
352 */
353 i = 0;
354 }
355 /*
356 * Loop over whole words in buffers. When we use up one buffer
357 * we move on to the next one.
358 */
359 while (len - i >= XFS_NBWORD) {
360 /*
361 * Compute difference between actual and desired value.
362 */
363 if ((wdiff = *b ^ want)) {
364 /*
365 * Different, mark where we are and return.
366 */
367 xfs_trans_brelse(tp, bp);
368 i += XFS_RTLOBIT(wdiff);
369 *rtblock = start + i - 1;
370 return 0;
371 }
372 i += XFS_NBWORD;
373 /*
374 * Go on to next block if that's where the next word is
375 * and we need the next word.
376 */
377 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
378 /*
379 * If done with this block, get the next one.
380 */
381 xfs_trans_brelse(tp, bp);
382 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
383 if (error) {
384 return error;
385 }
386 b = bufp = bp->b_addr;
387 word = 0;
388 } else {
389 /*
390 * Go on to the next word in the buffer.
391 */
392 b++;
393 }
394 }
395 /*
396 * If not ending on a word boundary, deal with the last
397 * (partial) word.
398 */
399 if ((lastbit = len - i)) {
400 /*
401 * Calculate mask for all the relevant bits in this word.
402 */
403 mask = ((xfs_rtword_t)1 << lastbit) - 1;
404 /*
405 * Compute difference between actual and desired value.
406 */
407 if ((wdiff = (*b ^ want) & mask)) {
408 /*
409 * Different, mark where we are and return.
410 */
411 xfs_trans_brelse(tp, bp);
412 i += XFS_RTLOBIT(wdiff);
413 *rtblock = start + i - 1;
414 return 0;
415 } else
416 i = len;
417 }
418 /*
419 * No match, return that we scanned the whole area.
420 */
421 xfs_trans_brelse(tp, bp);
422 *rtblock = start + i - 1;
423 return 0;
424}
425
426/*
427 * Read and modify the summary information for a given extent size,
428 * bitmap block combination.
429 * Keeps track of a current summary block, so we don't keep reading
430 * it from the buffer cache.
 *
 * rbpp may be NULL, in which case nothing is cached and the block is read
 * on every call. When rbpp is non-NULL and *rbpp is set, *rsb is read, so
 * rsb must then be valid too. Returns 0 after adding delta to the entry and
 * logging it, or the error from xfs_rtbuf_get().
 *
 * NOTE(review): when a cached buffer is released and the following
 * xfs_rtbuf_get() fails, *rbpp and *rsb are left referring to the released
 * buffer - confirm callers never reuse them after an error.
431 */
432int
433xfs_rtmodify_summary(
434 xfs_mount_t *mp, /* file system mount point */
435 xfs_trans_t *tp, /* transaction pointer */
436 int log, /* log2 of extent size */
437 xfs_rtblock_t bbno, /* bitmap block number */
438 int delta, /* change to make to summary info */
439 xfs_buf_t **rbpp, /* in/out: summary block buffer */
440 xfs_fsblock_t *rsb) /* in/out: summary block number */
441{
442 xfs_buf_t *bp; /* buffer for the summary block */
443 int error; /* error value */
444 xfs_fsblock_t sb; /* summary fsblock */
445 int so; /* index into the summary file */
446 xfs_suminfo_t *sp; /* pointer to returned data */
447
448 /*
449 * Compute entry number in the summary file.
450 */
451 so = XFS_SUMOFFS(mp, log, bbno);
452 /*
453 * Compute the block number in the summary file.
454 */
455 sb = XFS_SUMOFFSTOBLOCK(mp, so);
456 /*
457 * If we have an old buffer, and the block number matches, use that.
458 */
459 if (rbpp && *rbpp && *rsb == sb)
460 bp = *rbpp;
461 /*
462 * Otherwise we have to get the buffer.
463 */
464 else {
465 /*
466 * If there was an old one, get rid of it first.
467 */
468 if (rbpp && *rbpp)
469 xfs_trans_brelse(tp, *rbpp);
470 error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
471 if (error) {
472 return error;
473 }
474 /*
475 * Remember this buffer and block for the next call.
476 */
477 if (rbpp) {
478 *rbpp = bp;
479 *rsb = sb;
480 }
481 }
482 /*
483 * Point to the summary information, modify and log it.
 * The logged range is the first through last byte of the entry,
 * as offsets from the start of the buffer data.
484 */
485 sp = XFS_SUMPTR(mp, bp, so);
486 *sp += delta;
487 xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
488 (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
489 return 0;
490}
491
492/*
493 * Set the given range of bitmap bits to the given value.
494 * Do whatever I/O and logging is required.
 *
 * val is 1 to mark the range free and 0 to mark it allocated; it is
 * negated on entry so that it becomes an all-ones or all-zeroes word.
 *
 * The range is handled in three phases: a leading partial word (when
 * start is not word aligned), a run of whole words, and a trailing
 * partial word.  Whenever the next word needed lives in the following
 * bitmap block, the modified part of the current buffer is logged and
 * the next block is read with xfs_rtbuf_get().
 *
 * Returns 0 on success or the error returned by xfs_rtbuf_get().
495 */
496int
497xfs_rtmodify_range(
498 xfs_mount_t *mp, /* file system mount point */
499 xfs_trans_t *tp, /* transaction pointer */
500 xfs_rtblock_t start, /* starting block to modify */
501 xfs_extlen_t len, /* length of extent to modify */
502 int val) /* 1 for free, 0 for allocated */
503{
504 xfs_rtword_t *b; /* current word in buffer */
505 int bit; /* bit number in the word */
506 xfs_rtblock_t block; /* bitmap block number */
507 xfs_buf_t *bp; /* buf for the block */
508 xfs_rtword_t *bufp; /* starting word in buffer */
509 int error; /* error value */
510 xfs_rtword_t *first; /* first used word in the buffer */
511 int i; /* current bit number rel. to start */
512 int lastbit; /* last useful bit in word */
513 xfs_rtword_t mask; /* mask of relevant bits for value */
514 int word; /* word number in the buffer */
515
516 /*
517 * Compute starting bitmap block number.
518 */
519 block = XFS_BITTOBLOCK(mp, start);
520 /*
521 * Read the bitmap block, and point to its data.
522 */
523 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
524 if (error) {
525 return error;
526 }
527 bufp = bp->b_addr;
528 /*
529 * Compute the starting word's address, and starting bit.
530 */
531 word = XFS_BITTOWORD(mp, start);
532 first = b = &bufp[word];
533 bit = (int)(start & (XFS_NBWORD - 1));
534 /*
535 * 0 (allocated) => all zeroes; 1 (free) => all ones.
536 */
537 val = -val;
538 /*
539 * If not starting on a word boundary, deal with the first
540 * (partial) word.
541 */
542 if (bit) {
543 /*
544 * Compute first bit not changed and mask of relevant bits.
545 */
546 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
547 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
548 /*
549 * Set/clear the active bits.
550 */
551 if (val)
552 *b |= mask;
553 else
554 *b &= ~mask;
555 i = lastbit - bit;
556 /*
557 * Go on to the next block if that's where the next word is
558 * and we need the next word.
559 */
560 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
561 /*
562 * Log the changed part of this block.
563 * Get the next one.
 *
 * NOTE(review): the end offset passed here is the first
 * byte of the last word touched, whereas the final log
 * call at the bottom of the function ends on the last
 * byte of the last word.  Presumably the buffer logging
 * granularity makes this harmless - confirm.
564 */
565 xfs_trans_log_buf(tp, bp,
566 (uint)((char *)first - (char *)bufp),
567 (uint)((char *)b - (char *)bufp));
568 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
569 if (error) {
570 return error;
571 }
572 first = b = bufp = bp->b_addr;
573 word = 0;
574 } else {
575 /*
576 * Go on to the next word in the buffer
577 */
578 b++;
579 }
580 } else {
581 /*
582 * Starting on a word boundary, no partial word.
583 */
584 i = 0;
585 }
586 /*
587 * Loop over whole words in buffers. When we use up one buffer
588 * we move on to the next one.
589 */
590 while (len - i >= XFS_NBWORD) {
591 /*
592 * Set the word value correctly.
593 */
594 *b = val;
595 i += XFS_NBWORD;
596 /*
597 * Go on to the next block if that's where the next word is
598 * and we need the next word.
599 */
600 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
601 /*
602 * Log the changed part of this block.
603 * Get the next one.
604 */
605 xfs_trans_log_buf(tp, bp,
606 (uint)((char *)first - (char *)bufp),
607 (uint)((char *)b - (char *)bufp));
608 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
609 if (error) {
610 return error;
611 }
612 first = b = bufp = bp->b_addr;
613 word = 0;
614 } else {
615 /*
616 * Go on to the next word in the buffer
617 */
618 b++;
619 }
620 }
621 /*
622 * If not ending on a word boundary, deal with the last
623 * (partial) word.
624 */
625 if ((lastbit = len - i)) {
626 /*
627 * Compute a mask of relevant bits.
 * (bit is not read again in this function after being
 * reset below.)
628 */
629 bit = 0;
630 mask = ((xfs_rtword_t)1 << lastbit) - 1;
631 /*
632 * Set/clear the active bits.
633 */
634 if (val)
635 *b |= mask;
636 else
637 *b &= ~mask;
638 b++;
639 }
640 /*
641 * Log any remaining changed bytes.
 * b points one word past the last word modified in this buffer,
 * so the logged range ends on the byte just before it.
642 */
643 if (b > first)
644 xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
645 (uint)((char *)b - (char *)bufp - 1));
646 return 0;
647}
648
649/*
650 * Mark an extent specified by start and len freed.
651 * Updates all the summary information as well as the bitmap.
652 */
653int
654xfs_rtfree_range(
655 xfs_mount_t *mp, /* file system mount point */
656 xfs_trans_t *tp, /* transaction pointer */
657 xfs_rtblock_t start, /* starting block to free */
658 xfs_extlen_t len, /* length to free */
659 xfs_buf_t **rbpp, /* in/out: summary block buffer */
660 xfs_fsblock_t *rsb) /* in/out: summary block number */
661{
662 xfs_rtblock_t end; /* end of the freed extent */
663 int error; /* error value */
664 xfs_rtblock_t postblock; /* first block freed > end */
665 xfs_rtblock_t preblock; /* first block freed < start */
666
667 end = start + len - 1;
668 /*
669 * Modify the bitmap to mark this extent freed.
670 */
671 error = xfs_rtmodify_range(mp, tp, start, len, 1);
672 if (error) {
673 return error;
674 }
675 /*
676 * Assume we're freeing out of the middle of an allocated extent.
677 * We need to find the beginning and end of the extent so we can
678 * properly update the summary.
679 */
680 error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
681 if (error) {
682 return error;
683 }
684 /*
685 * Find the next allocated block (end of allocated extent).
686 */
687 error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
688 &postblock);
689 if (error)
690 return error;
691 /*
692 * If there are blocks not being freed at the front of the
693 * old extent, add summary data for them to be allocated.
694 */
695 if (preblock < start) {
696 error = xfs_rtmodify_summary(mp, tp,
697 XFS_RTBLOCKLOG(start - preblock),
698 XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
699 if (error) {
700 return error;
701 }
702 }
703 /*
704 * If there are blocks not being freed at the end of the
705 * old extent, add summary data for them to be allocated.
706 */
707 if (postblock > end) {
708 error = xfs_rtmodify_summary(mp, tp,
709 XFS_RTBLOCKLOG(postblock - end),
710 XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
711 if (error) {
712 return error;
713 }
714 }
715 /*
716 * Increment the summary information corresponding to the entire
717 * (new) free extent.
718 */
719 error = xfs_rtmodify_summary(mp, tp,
720 XFS_RTBLOCKLOG(postblock + 1 - preblock),
721 XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
722 return error;
723}
724
725/*
726 * Check that the given range is either all allocated (val = 0) or
727 * all free (val = 1).
 *
 * On a successful scan, *stat is 1 when every bit in the range matches
 * val and *new is start + len; otherwise *stat is 0 and *new is the
 * block number of the first bit that does not match.
 *
 * The bitmap buffers read here are released with xfs_trans_brelse()
 * before moving to the next block and before returning.  If
 * xfs_rtbuf_get() fails, its error is returned and *new / *stat are
 * left untouched.
728 */
729int
730xfs_rtcheck_range(
731 xfs_mount_t *mp, /* file system mount point */
732 xfs_trans_t *tp, /* transaction pointer */
733 xfs_rtblock_t start, /* starting block number of extent */
734 xfs_extlen_t len, /* length of extent */
735 int val, /* 1 for free, 0 for allocated */
736 xfs_rtblock_t *new, /* out: first block not matching */
737 int *stat) /* out: 1 for matches, 0 for not */
738{
739 xfs_rtword_t *b; /* current word in buffer */
740 int bit; /* bit number in the word */
741 xfs_rtblock_t block; /* bitmap block number */
742 xfs_buf_t *bp; /* buf for the block */
743 xfs_rtword_t *bufp; /* starting word in buffer */
744 int error; /* error value */
745 xfs_rtblock_t i; /* current bit number rel. to start */
746 xfs_rtblock_t lastbit; /* last useful bit in word */
747 xfs_rtword_t mask; /* mask of relevant bits for value */
748 xfs_rtword_t wdiff; /* difference from wanted value */
749 int word; /* word number in the buffer */
750
751 /*
752 * Compute starting bitmap block number
753 */
754 block = XFS_BITTOBLOCK(mp, start);
755 /*
756 * Read the bitmap block.
757 */
758 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
759 if (error) {
760 return error;
761 }
762 bufp = bp->b_addr;
763 /*
764 * Compute the starting word's address, and starting bit.
765 */
766 word = XFS_BITTOWORD(mp, start);
767 b = &bufp[word];
768 bit = (int)(start & (XFS_NBWORD - 1));
769 /*
770 * 0 (allocated) => all zero's; 1 (free) => all one's.
771 */
772 val = -val;
773 /*
774 * If not starting on a word boundary, deal with the first
775 * (partial) word.
776 */
777 if (bit) {
778 /*
779 * Compute first bit not examined.
780 */
781 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
782 /*
783 * Mask of relevant bits.
784 */
785 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
786 /*
787 * Compute difference between actual and desired value.
788 */
789 if ((wdiff = (*b ^ val) & mask)) {
790 /*
791 * Different, compute first wrong bit and return.
 * The bit index within the word is rebased by the
 * starting bit so that i is relative to start.
792 */
793 xfs_trans_brelse(tp, bp);
794 i = XFS_RTLOBIT(wdiff) - bit;
795 *new = start + i;
796 *stat = 0;
797 return 0;
798 }
799 i = lastbit - bit;
800 /*
801 * Go on to next block if that's where the next word is
802 * and we need the next word.
803 */
804 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
805 /*
806 * If done with this block, get the next one.
807 */
808 xfs_trans_brelse(tp, bp);
809 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
810 if (error) {
811 return error;
812 }
813 b = bufp = bp->b_addr;
814 word = 0;
815 } else {
816 /*
817 * Go on to the next word in the buffer.
818 */
819 b++;
820 }
821 } else {
822 /*
823 * Starting on a word boundary, no partial word.
824 */
825 i = 0;
826 }
827 /*
828 * Loop over whole words in buffers. When we use up one buffer
829 * we move on to the next one.
830 */
831 while (len - i >= XFS_NBWORD) {
832 /*
833 * Compute difference between actual and desired value.
834 */
835 if ((wdiff = *b ^ val)) {
836 /*
837 * Different, compute first wrong bit and return.
838 */
839 xfs_trans_brelse(tp, bp);
840 i += XFS_RTLOBIT(wdiff);
841 *new = start + i;
842 *stat = 0;
843 return 0;
844 }
845 i += XFS_NBWORD;
846 /*
847 * Go on to next block if that's where the next word is
848 * and we need the next word.
849 */
850 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
851 /*
852 * If done with this block, get the next one.
853 */
854 xfs_trans_brelse(tp, bp);
855 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
856 if (error) {
857 return error;
858 }
859 b = bufp = bp->b_addr;
860 word = 0;
861 } else {
862 /*
863 * Go on to the next word in the buffer.
864 */
865 b++;
866 }
867 }
868 /*
869 * If not ending on a word boundary, deal with the last
870 * (partial) word.
871 */
872 if ((lastbit = len - i)) {
873 /*
874 * Mask of relevant bits.
875 */
876 mask = ((xfs_rtword_t)1 << lastbit) - 1;
877 /*
878 * Compute difference between actual and desired value.
879 */
880 if ((wdiff = (*b ^ val) & mask)) {
881 /*
882 * Different, compute first wrong bit and return.
883 */
884 xfs_trans_brelse(tp, bp);
885 i += XFS_RTLOBIT(wdiff);
886 *new = start + i;
887 *stat = 0;
888 return 0;
889 } else
890 i = len;
891 }
892 /*
893 * Successful, return.
 * i equals len here, so *new is the first block past the range.
894 */
895 xfs_trans_brelse(tp, bp);
896 *new = start + i;
897 *stat = 1;
898 return 0;
899}
900
#ifdef DEBUG
/*
 * Debug-only sanity check: assert that the whole extent
 * [bno, bno + len) is currently marked allocated in the realtime bitmap.
 * Only errors from xfs_rtcheck_range() are returned to the caller; a
 * mismatch trips the ASSERT instead.
 */
STATIC int				/* error */
xfs_rtcheck_alloc_range(
	xfs_mount_t	*mp,		/* file system mount point */
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_rtblock_t	bno,		/* starting block number of extent */
	xfs_extlen_t	len)		/* length of extent */
{
	xfs_rtblock_t	mismatch;	/* required by xfs_rtcheck_range, unused */
	int		all_allocated;
	int		error;

	error = xfs_rtcheck_range(mp, tp, bno, len, 0, &mismatch,
			&all_allocated);
	if (!error) {
		ASSERT(all_allocated);
	}
	return error;
}
#else
#define xfs_rtcheck_alloc_range(m,t,b,l)	(0)
#endif
925/*
926 * Free an extent in the realtime subvolume. Length is expressed in
927 * realtime extents, as is the block number.
928 */
929int /* error */
930xfs_rtfree_extent(
931 xfs_trans_t *tp, /* transaction pointer */
932 xfs_rtblock_t bno, /* starting block number to free */
933 xfs_extlen_t len) /* length of extent freed */
934{
935 int error; /* error value */
936 xfs_mount_t *mp; /* file system mount structure */
937 xfs_fsblock_t sb; /* summary file block number */
938 xfs_buf_t *sumbp = NULL; /* summary file block buffer */
939
940 mp = tp->t_mountp;
941
942 ASSERT(mp->m_rbmip->i_itemp != NULL);
943 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
944
945 error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
946 if (error)
947 return error;
948
949 /*
950 * Free the range of realtime blocks.
951 */
952 error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
953 if (error) {
954 return error;
955 }
956 /*
957 * Mark more blocks free in the superblock.
958 */
959 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
960 /*
961 * If we've now freed all the blocks, reset the file sequence
962 * number to 0.
963 */
964 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
965 mp->m_sb.sb_rextents) {
966 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
967 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
968 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
969 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
970 }
971 return 0;
972}
973
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
new file mode 100644
index 000000000000..6e93b5ef0a6b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -0,0 +1,836 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_shared.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h"
24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_inode.h"
29#include "xfs_ialloc.h"
30#include "xfs_alloc.h"
31#include "xfs_error.h"
32#include "xfs_trace.h"
33#include "xfs_cksum.h"
34#include "xfs_trans.h"
35#include "xfs_buf_item.h"
36#include "xfs_dinode.h"
37#include "xfs_bmap_btree.h"
38#include "xfs_alloc_btree.h"
39#include "xfs_ialloc_btree.h"
40
41/*
42 * Physical superblock buffer manipulations. Shared with libxfs in userspace.
43 */
44
/*
 * Layout table for the in-core superblock, used by xfs_sb_to_disk().
 *
 * xfs_sb_to_disk() indexes this table with the bit position of each field
 * in its fields mask and takes the size of a field to be the difference
 * between its offset and the offset of the following entry.  The entries
 * therefore have to stay in field order, and the final entry
 * (sizeof(xfs_sb_t)) is a sentinel that only exists to give the last real
 * field a size.
 */
45static const struct {
46 short offset;
47 short type; /* 0 = integer
48 * 1 = binary / string (no translation)
49 */
50} xfs_sb_info[] = {
51 { offsetof(xfs_sb_t, sb_magicnum), 0 },
52 { offsetof(xfs_sb_t, sb_blocksize), 0 },
53 { offsetof(xfs_sb_t, sb_dblocks), 0 },
54 { offsetof(xfs_sb_t, sb_rblocks), 0 },
55 { offsetof(xfs_sb_t, sb_rextents), 0 },
56 { offsetof(xfs_sb_t, sb_uuid), 1 },
57 { offsetof(xfs_sb_t, sb_logstart), 0 },
58 { offsetof(xfs_sb_t, sb_rootino), 0 },
59 { offsetof(xfs_sb_t, sb_rbmino), 0 },
60 { offsetof(xfs_sb_t, sb_rsumino), 0 },
61 { offsetof(xfs_sb_t, sb_rextsize), 0 },
62 { offsetof(xfs_sb_t, sb_agblocks), 0 },
63 { offsetof(xfs_sb_t, sb_agcount), 0 },
64 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
65 { offsetof(xfs_sb_t, sb_logblocks), 0 },
66 { offsetof(xfs_sb_t, sb_versionnum), 0 },
67 { offsetof(xfs_sb_t, sb_sectsize), 0 },
68 { offsetof(xfs_sb_t, sb_inodesize), 0 },
69 { offsetof(xfs_sb_t, sb_inopblock), 0 },
70 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
71 { offsetof(xfs_sb_t, sb_blocklog), 0 },
72 { offsetof(xfs_sb_t, sb_sectlog), 0 },
73 { offsetof(xfs_sb_t, sb_inodelog), 0 },
74 { offsetof(xfs_sb_t, sb_inopblog), 0 },
75 { offsetof(xfs_sb_t, sb_agblklog), 0 },
76 { offsetof(xfs_sb_t, sb_rextslog), 0 },
77 { offsetof(xfs_sb_t, sb_inprogress), 0 },
78 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
79 { offsetof(xfs_sb_t, sb_icount), 0 },
80 { offsetof(xfs_sb_t, sb_ifree), 0 },
81 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
82 { offsetof(xfs_sb_t, sb_frextents), 0 },
83 { offsetof(xfs_sb_t, sb_uquotino), 0 },
84 { offsetof(xfs_sb_t, sb_gquotino), 0 },
85 { offsetof(xfs_sb_t, sb_qflags), 0 },
86 { offsetof(xfs_sb_t, sb_flags), 0 },
87 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
88 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
89 { offsetof(xfs_sb_t, sb_unit), 0 },
90 { offsetof(xfs_sb_t, sb_width), 0 },
91 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
92 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
93 { offsetof(xfs_sb_t, sb_logsectsize), 0 },
94 { offsetof(xfs_sb_t, sb_logsunit), 0 },
95 { offsetof(xfs_sb_t, sb_features2), 0 },
96 { offsetof(xfs_sb_t, sb_bad_features2), 0 },
97 { offsetof(xfs_sb_t, sb_features_compat), 0 },
98 { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
99 { offsetof(xfs_sb_t, sb_features_incompat), 0 },
100 { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
101 { offsetof(xfs_sb_t, sb_crc), 0 },
102 { offsetof(xfs_sb_t, sb_pad), 0 },
103 { offsetof(xfs_sb_t, sb_pquotino), 0 },
104 { offsetof(xfs_sb_t, sb_lsn), 0 },
105 { sizeof(xfs_sb_t), 0 }
106};
107
108/*
109 * Reference counting access wrappers to the perag structures.
110 * Because we never free per-ag structures, the only thing we
111 * have to protect against changes is the tree structure itself.
112 */
113struct xfs_perag *
114xfs_perag_get(
115 struct xfs_mount *mp,
116 xfs_agnumber_t agno)
117{
118 struct xfs_perag *pag;
119 int ref = 0;
120
121 rcu_read_lock();
122 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
123 if (pag) {
124 ASSERT(atomic_read(&pag->pag_ref) >= 0);
125 ref = atomic_inc_return(&pag->pag_ref);
126 }
127 rcu_read_unlock();
128 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
129 return pag;
130}
131
132/*
133 * search from @first to find the next perag with the given tag set.
134 */
135struct xfs_perag *
136xfs_perag_get_tag(
137 struct xfs_mount *mp,
138 xfs_agnumber_t first,
139 int tag)
140{
141 struct xfs_perag *pag;
142 int found;
143 int ref;
144
145 rcu_read_lock();
146 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
147 (void **)&pag, first, 1, tag);
148 if (found <= 0) {
149 rcu_read_unlock();
150 return NULL;
151 }
152 ref = atomic_inc_return(&pag->pag_ref);
153 rcu_read_unlock();
154 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
155 return pag;
156}
157
158void
159xfs_perag_put(
160 struct xfs_perag *pag)
161{
162 int ref;
163
164 ASSERT(atomic_read(&pag->pag_ref) > 0);
165 ref = atomic_dec_return(&pag->pag_ref);
166 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
167}
168
169/*
170 * Check the validity of the SB found.
 *
 * check_inprogress: also reject a superblock whose sb_inprogress is set.
 * check_version: also run the version 5 feature mask checks.
 *
 * Returns 0 when the superblock is acceptable, otherwise:
 *   -EWRONGFS      bad magic number or unsupported version
 *   -EINVAL        unknown incompatible features, unknown ro-compat
 *                  features on a read-write mount, or a log device
 *                  setup that contradicts sb_logstart
 *   -EFSCORRUPTED  inconsistent quota flags, failed geometry sanity
 *                  checks, or (if requested) sb_inprogress set
 *   -ENOSYS        block size above PAGE_SIZE or unsupported inode size
 *   -EFBIG         data or realtime block count too large for this
 *                  system
171 */
172STATIC int
173xfs_mount_validate_sb(
174 xfs_mount_t *mp,
175 xfs_sb_t *sbp,
176 bool check_inprogress,
177 bool check_version)
178{
179
180 /*
181 * The magic number and the version are checked before anything
182 * else; nothing further in the superblock can be trusted if
183 * either of them is wrong.
186 */
187 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
188 xfs_warn(mp, "bad magic number");
189 return -EWRONGFS;
190 }
191
192
193 if (!xfs_sb_good_version(sbp)) {
194 xfs_warn(mp, "bad version");
195 return -EWRONGFS;
196 }
197
198 /*
199 * Version 5 superblock feature mask validation. Reject combinations the
200 * kernel cannot support up front before checking anything else. For
201 * write validation, we don't need to check feature masks.
202 */
203 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
204 if (xfs_sb_has_compat_feature(sbp,
205 XFS_SB_FEAT_COMPAT_UNKNOWN)) {
206 xfs_warn(mp,
207"Superblock has unknown compatible features (0x%x) enabled.\n"
208"Using a more recent kernel is recommended.",
209 (sbp->sb_features_compat &
210 XFS_SB_FEAT_COMPAT_UNKNOWN));
211 }
212
213 if (xfs_sb_has_ro_compat_feature(sbp,
214 XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
215 xfs_alert(mp,
216"Superblock has unknown read-only compatible features (0x%x) enabled.",
217 (sbp->sb_features_ro_compat &
218 XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
219 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
220 xfs_warn(mp,
221"Attempted to mount read-only compatible filesystem read-write.\n"
222"Filesystem can only be safely mounted read only.");
223 return -EINVAL;
224 }
225 }
226 if (xfs_sb_has_incompat_feature(sbp,
227 XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
228 xfs_warn(mp,
229"Superblock has unknown incompatible features (0x%x) enabled.\n"
230"Filesystem can not be safely mounted by this kernel.",
231 (sbp->sb_features_incompat &
232 XFS_SB_FEAT_INCOMPAT_UNKNOWN));
233 return -EINVAL;
234 }
235 }
236
 /*
 * The old combined XFS_OQUOTA_* bits and the separate group/project
 * ENFD/CHKD bits must not appear on the wrong superblock version.
 */
237 if (xfs_sb_version_has_pquotino(sbp)) {
238 if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
239 xfs_notice(mp,
240 "Version 5 of Super block has XFS_OQUOTA bits.");
241 return -EFSCORRUPTED;
242 }
243 } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
244 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
245 xfs_notice(mp,
246"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
247 return -EFSCORRUPTED;
248 }
249
 /*
 * If the log device and data device have the same device number,
 * the log is internal.  Consequently, sb_logstart should be
 * non-zero.  If we have a zero sb_logstart in this case, we may be
 * trying to mount a volume filesystem in a non-volume manner.
 */
250 if (unlikely(
251 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
252 xfs_warn(mp,
253 "filesystem is marked as having an external log; "
254 "specify logdev on the mount command line.");
255 return -EINVAL;
256 }
257
 /*
 * The converse: a non-zero sb_logstart with a separate log device.
 */
258 if (unlikely(
259 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
260 xfs_warn(mp,
261 "filesystem is marked as having an internal log; "
262 "do not specify logdev on the mount command line.");
263 return -EINVAL;
264 }
265
266 /*
267 * More sanity checking. Most of these were stolen directly from
268 * xfs_repair.
269 */
270 if (unlikely(
271 sbp->sb_agcount <= 0 ||
272 sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
273 sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
274 sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
275 sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
276 sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
277 sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
278 sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
279 sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
280 sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
281 sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
282 sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
283 sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
284 sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
285 sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
286 sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
287 sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
288 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
289 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
290 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
291 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
292 sbp->sb_dblocks == 0 ||
293 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
294 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
295 sbp->sb_shared_vn != 0)) {
296 xfs_notice(mp, "SB sanity check failed");
297 return -EFSCORRUPTED;
298 }
299
300 /*
301 * Until this is fixed only page-sized or smaller data blocks work.
302 */
303 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
304 xfs_warn(mp,
305 "File system with blocksize %d bytes. "
306 "Only pagesize (%ld) or less will currently work.",
307 sbp->sb_blocksize, PAGE_SIZE);
308 return -ENOSYS;
309 }
310
311 /*
312 * Currently only very few inode sizes are supported.
313 */
314 switch (sbp->sb_inodesize) {
315 case 256:
316 case 512:
317 case 1024:
318 case 2048:
319 break;
320 default:
321 xfs_warn(mp, "inode size of %d bytes not supported",
322 sbp->sb_inodesize);
323 return -ENOSYS;
324 }
325
326 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
327 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
328 xfs_warn(mp,
329 "file system too large to be mounted on this system.");
330 return -EFBIG;
331 }
332
333 if (check_inprogress && sbp->sb_inprogress) {
334 xfs_warn(mp, "Offline file system operation in progress!");
335 return -EFSCORRUPTED;
336 }
337 return 0;
338}
339
340void
341xfs_sb_quota_from_disk(struct xfs_sb *sbp)
342{
343 /*
344 * older mkfs doesn't initialize quota inodes to NULLFSINO. This
345 * leads to in-core values having two different values for a quota
346 * inode to be invalid: 0 and NULLFSINO. Change it to a single value
347 * NULLFSINO.
348 *
349 * Note that this change affect only the in-core values. These
350 * values are not written back to disk unless any quota information
351 * is written to the disk. Even in that case, sb_pquotino field is
352 * not written to disk unless the superblock supports pquotino.
353 */
354 if (sbp->sb_uquotino == 0)
355 sbp->sb_uquotino = NULLFSINO;
356 if (sbp->sb_gquotino == 0)
357 sbp->sb_gquotino = NULLFSINO;
358 if (sbp->sb_pquotino == 0)
359 sbp->sb_pquotino = NULLFSINO;
360
361 /*
362 * We need to do these manipilations only if we are working
363 * with an older version of on-disk superblock.
364 */
365 if (xfs_sb_version_has_pquotino(sbp))
366 return;
367
368 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
369 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
370 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
371 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
372 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
373 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
374 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
375
376 if (sbp->sb_qflags & XFS_PQUOTA_ACCT) {
377 /*
378 * In older version of superblock, on-disk superblock only
379 * has sb_gquotino, and in-core superblock has both sb_gquotino
380 * and sb_pquotino. But, only one of them is supported at any
381 * point of time. So, if PQUOTA is set in disk superblock,
382 * copy over sb_gquotino to sb_pquotino.
383 */
384 sbp->sb_pquotino = sbp->sb_gquotino;
385 sbp->sb_gquotino = NULLFSINO;
386 }
387}
388
/*
 * Convert an on-disk superblock into its in-core representation.
 *
 * Multi-byte integer fields are converted from big-endian to CPU byte
 * order, single-byte fields are copied as they are, and sb_uuid and
 * sb_fname are copied with memcpy().  sb_pad is always set to 0 in the
 * in-core copy, and sb_crc is not assigned here.  No quota translation
 * is done by this function.
 */
389void
390xfs_sb_from_disk(
391 struct xfs_sb *to,
392 xfs_dsb_t *from)
393{
394 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
395 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
396 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
397 to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
398 to->sb_rextents = be64_to_cpu(from->sb_rextents);
399 memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
400 to->sb_logstart = be64_to_cpu(from->sb_logstart);
401 to->sb_rootino = be64_to_cpu(from->sb_rootino);
402 to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
403 to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
404 to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
405 to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
406 to->sb_agcount = be32_to_cpu(from->sb_agcount);
407 to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
408 to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
409 to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
410 to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
411 to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
412 to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
413 memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
414 to->sb_blocklog = from->sb_blocklog;
415 to->sb_sectlog = from->sb_sectlog;
416 to->sb_inodelog = from->sb_inodelog;
417 to->sb_inopblog = from->sb_inopblog;
418 to->sb_agblklog = from->sb_agblklog;
419 to->sb_rextslog = from->sb_rextslog;
420 to->sb_inprogress = from->sb_inprogress;
421 to->sb_imax_pct = from->sb_imax_pct;
422 to->sb_icount = be64_to_cpu(from->sb_icount);
423 to->sb_ifree = be64_to_cpu(from->sb_ifree);
424 to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
425 to->sb_frextents = be64_to_cpu(from->sb_frextents);
426 to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
427 to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
428 to->sb_qflags = be16_to_cpu(from->sb_qflags);
429 to->sb_flags = from->sb_flags;
430 to->sb_shared_vn = from->sb_shared_vn;
431 to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
432 to->sb_unit = be32_to_cpu(from->sb_unit);
433 to->sb_width = be32_to_cpu(from->sb_width);
434 to->sb_dirblklog = from->sb_dirblklog;
435 to->sb_logsectlog = from->sb_logsectlog;
436 to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
437 to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
438 to->sb_features2 = be32_to_cpu(from->sb_features2);
439 to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
440 to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
441 to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
442 to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
443 to->sb_features_log_incompat =
444 be32_to_cpu(from->sb_features_log_incompat);
 /* the in-core pad is always zero, whatever is on disk */
445 to->sb_pad = 0;
446 to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
447 to->sb_lsn = be64_to_cpu(from->sb_lsn);
448}
449
450static inline void
451xfs_sb_quota_to_disk(
452 xfs_dsb_t *to,
453 xfs_sb_t *from,
454 __int64_t *fields)
455{
456 __uint16_t qflags = from->sb_qflags;
457
458 /*
459 * We need to do these manipilations only if we are working
460 * with an older version of on-disk superblock.
461 */
462 if (xfs_sb_version_has_pquotino(from))
463 return;
464
465 if (*fields & XFS_SB_QFLAGS) {
466 /*
467 * The in-core version of sb_qflags do not have
468 * XFS_OQUOTA_* flags, whereas the on-disk version
469 * does. So, convert incore XFS_{PG}QUOTA_* flags
470 * to on-disk XFS_OQUOTA_* flags.
471 */
472 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
473 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
474
475 if (from->sb_qflags &
476 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
477 qflags |= XFS_OQUOTA_ENFD;
478 if (from->sb_qflags &
479 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
480 qflags |= XFS_OQUOTA_CHKD;
481 to->sb_qflags = cpu_to_be16(qflags);
482 *fields &= ~XFS_SB_QFLAGS;
483 }
484
485 /*
486 * GQUOTINO and PQUOTINO cannot be used together in versions of
487 * superblock that do not have pquotino. from->sb_flags tells us which
488 * quota is active and should be copied to disk. If neither are active,
489 * make sure we write NULLFSINO to the sb_gquotino field as a quota
490 * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
491 * bit is set.
492 *
493 * Note that we don't need to handle the sb_uquotino or sb_pquotino here
494 * as they do not require any translation. Hence the main sb field loop
495 * will write them appropriately from the in-core superblock.
496 */
497 if ((*fields & XFS_SB_GQUOTINO) &&
498 (from->sb_qflags & XFS_GQUOTA_ACCT))
499 to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
500 else if ((*fields & XFS_SB_PQUOTINO) &&
501 (from->sb_qflags & XFS_PQUOTA_ACCT))
502 to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
503 else {
504 /*
505 * We can't rely on just the fields being logged to tell us
506 * that it is safe to write NULLFSINO - we should only do that
507 * if quotas are not actually enabled. Hence only write
508 * NULLFSINO if both in-core quota inodes are NULL.
509 */
510 if (from->sb_gquotino == NULLFSINO &&
511 from->sb_pquotino == NULLFSINO)
512 to->sb_gquotino = cpu_to_be64(NULLFSINO);
513 }
514
515 *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
516}
517
518/*
519 * Copy in core superblock to ondisk one.
520 *
521 * The fields argument is mask of superblock fields to copy.
522 */
523void
524xfs_sb_to_disk(
525 xfs_dsb_t *to,
526 xfs_sb_t *from,
527 __int64_t fields)
528{
529 xfs_caddr_t to_ptr = (xfs_caddr_t)to;
530 xfs_caddr_t from_ptr = (xfs_caddr_t)from;
531 xfs_sb_field_t f;
532 int first;
533 int size;
534
535 ASSERT(fields);
536 if (!fields)
537 return;
538
539 xfs_sb_quota_to_disk(to, from, &fields);
540 while (fields) {
541 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
542 first = xfs_sb_info[f].offset;
543 size = xfs_sb_info[f + 1].offset - first;
544
545 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
546
547 if (size == 1 || xfs_sb_info[f].type == 1) {
548 memcpy(to_ptr + first, from_ptr + first, size);
549 } else {
550 switch (size) {
551 case 2:
552 *(__be16 *)(to_ptr + first) =
553 cpu_to_be16(*(__u16 *)(from_ptr + first));
554 break;
555 case 4:
556 *(__be32 *)(to_ptr + first) =
557 cpu_to_be32(*(__u32 *)(from_ptr + first));
558 break;
559 case 8:
560 *(__be64 *)(to_ptr + first) =
561 cpu_to_be64(*(__u64 *)(from_ptr + first));
562 break;
563 default:
564 ASSERT(0);
565 }
566 }
567
568 fields &= ~(1LL << f);
569 }
570}
571
572static int
573xfs_sb_verify(
574 struct xfs_buf *bp,
575 bool check_version)
576{
577 struct xfs_mount *mp = bp->b_target->bt_mount;
578 struct xfs_sb sb;
579
580 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
581
582 /*
583 * Only check the in progress field for the primary superblock as
584 * mkfs.xfs doesn't clear it from secondary superblocks.
585 */
586 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
587 check_version);
588}
589
590/*
591 * If the superblock has the CRC feature bit set or the CRC field is non-null,
592 * check that the CRC is valid. We check the CRC field is non-null because a
593 * single bit error could clear the feature bit and unused parts of the
594 * superblock are supposed to be zero. Hence a non-null crc field indicates that
595 * we've potentially lost a feature bit and we should check it anyway.
596 *
597 * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
598 * last field in V4 secondary superblocks. So for secondary superblocks,
599 * we are more forgiving, and ignore CRC failures if the primary doesn't
600 * indicate that the fs version is V5.
601 */
602static void
603xfs_sb_read_verify(
604 struct xfs_buf *bp)
605{
606 struct xfs_mount *mp = bp->b_target->bt_mount;
607 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
608 int error;
609
610 /*
611 * open code the version check to avoid needing to convert the entire
612 * superblock from disk order just to check the version number
613 */
614 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
615 (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
616 XFS_SB_VERSION_5) ||
617 dsb->sb_crc != 0)) {
618
619 if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
620 /* Only fail bad secondaries on a known V5 filesystem */
621 if (bp->b_bn == XFS_SB_DADDR ||
622 xfs_sb_version_hascrc(&mp->m_sb)) {
623 error = -EFSBADCRC;
624 goto out_error;
625 }
626 }
627 }
628 error = xfs_sb_verify(bp, true);
629
630out_error:
631 if (error) {
632 xfs_buf_ioerror(bp, error);
633 if (error == -EFSCORRUPTED || error == -EFSBADCRC)
634 xfs_verifier_error(bp);
635 }
636}
637
638/*
639 * We may be probed for a filesystem match, so we may not want to emit
640 * messages when the superblock buffer is not actually an XFS superblock.
641 * If we find an XFS superblock, then run a normal, noisy mount because we are
642 * really going to mount it and want to know about errors.
643 */
644static void
645xfs_sb_quiet_read_verify(
646 struct xfs_buf *bp)
647{
648 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
649
650 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
651 /* XFS filesystem, verify noisily! */
652 xfs_sb_read_verify(bp);
653 return;
654 }
655 /* quietly fail */
656 xfs_buf_ioerror(bp, -EWRONGFS);
657}
658
659static void
660xfs_sb_write_verify(
661 struct xfs_buf *bp)
662{
663 struct xfs_mount *mp = bp->b_target->bt_mount;
664 struct xfs_buf_log_item *bip = bp->b_fspriv;
665 int error;
666
667 error = xfs_sb_verify(bp, false);
668 if (error) {
669 xfs_buf_ioerror(bp, error);
670 xfs_verifier_error(bp);
671 return;
672 }
673
674 if (!xfs_sb_version_hascrc(&mp->m_sb))
675 return;
676
677 if (bip)
678 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
679
680 xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
681}
682
683const struct xfs_buf_ops xfs_sb_buf_ops = {
684 .verify_read = xfs_sb_read_verify,
685 .verify_write = xfs_sb_write_verify,
686};
687
688const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
689 .verify_read = xfs_sb_quiet_read_verify,
690 .verify_write = xfs_sb_write_verify,
691};
692
693/*
694 * xfs_mount_common
695 *
696 * Mount initialization code establishing various mount
697 * fields from the superblock associated with the given
698 * mount structure
699 */
700void
701xfs_sb_mount_common(
702 struct xfs_mount *mp,
703 struct xfs_sb *sbp)
704{
705 mp->m_agfrotor = mp->m_agirotor = 0;
706 spin_lock_init(&mp->m_agirotor_lock);
707 mp->m_maxagi = mp->m_sb.sb_agcount;
708 mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
709 mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
710 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
711 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
712 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
713 mp->m_blockmask = sbp->sb_blocksize - 1;
714 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
715 mp->m_blockwmask = mp->m_blockwsize - 1;
716
717 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
718 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
719 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
720 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
721
722 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
723 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
724 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
725 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
726
727 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
728 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
729 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
730 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
731
732 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
733 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
734 sbp->sb_inopblock);
735 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
736}
737
738/*
739 * xfs_initialize_perag_data
740 *
741 * Read in each per-ag structure so we can count up the number of
742 * allocated inodes, free inodes and used filesystem blocks as this
743 * information is no longer persistent in the superblock. Once we have
744 * this information, write it into the in-core superblock structure.
745 */
746int
747xfs_initialize_perag_data(
748 struct xfs_mount *mp,
749 xfs_agnumber_t agcount)
750{
751 xfs_agnumber_t index;
752 xfs_perag_t *pag;
753 xfs_sb_t *sbp = &mp->m_sb;
754 uint64_t ifree = 0;
755 uint64_t ialloc = 0;
756 uint64_t bfree = 0;
757 uint64_t bfreelst = 0;
758 uint64_t btree = 0;
759 int error;
760
761 for (index = 0; index < agcount; index++) {
762 /*
763 * read the agf, then the agi. This gets us
764 * all the information we need and populates the
765 * per-ag structures for us.
766 */
767 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
768 if (error)
769 return error;
770
771 error = xfs_ialloc_pagi_init(mp, NULL, index);
772 if (error)
773 return error;
774 pag = xfs_perag_get(mp, index);
775 ifree += pag->pagi_freecount;
776 ialloc += pag->pagi_count;
777 bfree += pag->pagf_freeblks;
778 bfreelst += pag->pagf_flcount;
779 btree += pag->pagf_btreeblks;
780 xfs_perag_put(pag);
781 }
782 /*
783 * Overwrite incore superblock counters with just-read data
784 */
785 spin_lock(&mp->m_sb_lock);
786 sbp->sb_ifree = ifree;
787 sbp->sb_icount = ialloc;
788 sbp->sb_fdblocks = bfree + bfreelst + btree;
789 spin_unlock(&mp->m_sb_lock);
790
791 /* Fixup the per-cpu counters as well. */
792 xfs_icsb_reinit_counters(mp);
793
794 return 0;
795}
796
797/*
798 * xfs_mod_sb() can be used to copy arbitrary changes to the
799 * in-core superblock into the superblock buffer to be logged.
800 * It does not provide the higher level of locking that is
801 * needed to protect the in-core superblock from concurrent
802 * access.
803 */
804void
805xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
806{
807 xfs_buf_t *bp;
808 int first;
809 int last;
810 xfs_mount_t *mp;
811 xfs_sb_field_t f;
812
813 ASSERT(fields);
814 if (!fields)
815 return;
816 mp = tp->t_mountp;
817 bp = xfs_trans_getsb(tp, mp, 0);
818 first = sizeof(xfs_sb_t);
819 last = 0;
820
821 /* translate/copy */
822
823 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
824
825 /* find modified range */
826 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
827 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
828 last = xfs_sb_info[f + 1].offset - 1;
829
830 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
831 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
832 first = xfs_sb_info[f].offset;
833
834 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
835 xfs_trans_log_buf(tp, bp, first, last);
836}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
new file mode 100644
index 000000000000..c43c2d609a24
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -0,0 +1,621 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SB_H__
19#define __XFS_SB_H__
20
21/*
22 * Super block
23 * Fits into a sector-sized buffer at address 0 of each allocation group.
24 * Only the first of these is ever updated except during growfs.
25 */
26
27struct xfs_buf;
28struct xfs_mount;
29struct xfs_trans;
30
#define	XFS_SB_MAGIC		0x58465342	/* 'XFSB' */

/* Superblock version numbers, held in the low nibble of sb_versionnum. */
#define	XFS_SB_VERSION_1	1		/* 5.3, 6.0.1, 6.1 */
#define	XFS_SB_VERSION_2	2		/* 6.2 - attributes */
#define	XFS_SB_VERSION_3	3		/* 6.2 - new inode version */
#define	XFS_SB_VERSION_4	4		/* 6.2+ - bitmask version */
#define	XFS_SB_VERSION_5	5		/* CRC enabled filesystem */

/* Split of sb_versionnum into version number and feature bits. */
#define	XFS_SB_VERSION_NUMBITS		0x000f
#define	XFS_SB_VERSION_ALLFBITS		0xfff0

/* Individual feature bits in sb_versionnum. */
#define	XFS_SB_VERSION_ATTRBIT		(1 << 4)
#define	XFS_SB_VERSION_NLINKBIT		(1 << 5)
#define	XFS_SB_VERSION_QUOTABIT		(1 << 6)
#define	XFS_SB_VERSION_ALIGNBIT		(1 << 7)
#define	XFS_SB_VERSION_DALIGNBIT	(1 << 8)
#define	XFS_SB_VERSION_SHAREDBIT	(1 << 9)
#define	XFS_SB_VERSION_LOGV2BIT		(1 << 10)
#define	XFS_SB_VERSION_SECTORBIT	(1 << 11)
#define	XFS_SB_VERSION_EXTFLGBIT	(1 << 12)
#define	XFS_SB_VERSION_DIRV2BIT		(1 << 13)
#define	XFS_SB_VERSION_BORGBIT		(1 << 14)	/* ASCII only case-insens. */
#define	XFS_SB_VERSION_MOREBITSBIT	(1 << 15)

/*
 * Every bit of sb_versionnum has been assigned and is understood, so the
 * supported set is simply all of them - with the exception of the shared
 * superblock bit, whose purpose nobody knows and which is therefore
 * unsupported.
 */
#define	XFS_SB_VERSION_OKBITS		\
	((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
		~XFS_SB_VERSION_SHAREDBIT)
60
/*
 * XFS "feature" bits live in two words: the original sb_versionnum and the
 * newer sb_features2.  Whenever any bit is set in sb_features2 the
 * XFS_SB_VERSION_MOREBITSBIT feature bit must be set as well.
 *
 * The definitions below are the bits of sb_features2.
 */
#define XFS_SB_VERSION2_RESERVED1BIT	(1 << 0)
#define XFS_SB_VERSION2_LAZYSBCOUNTBIT	(1 << 1)	/* Superblk counters */
#define XFS_SB_VERSION2_RESERVED4BIT	(1 << 2)
#define XFS_SB_VERSION2_ATTR2BIT	(1 << 3)	/* Inline attr rework */
#define XFS_SB_VERSION2_PARENTBIT	(1 << 4)	/* parent pointers */
#define XFS_SB_VERSION2_PROJID32BIT	(1 << 7)	/* 32 bit project id */
#define XFS_SB_VERSION2_CRCBIT		(1 << 8)	/* metadata CRCs */
#define XFS_SB_VERSION2_FTYPE		(1 << 9)	/* inode type in dir */

/* sb_features2 bits accepted by the v4 feature check */
#define	XFS_SB_VERSION2_OKBITS		\
	(XFS_SB_VERSION2_LAZYSBCOUNTBIT	| \
	 XFS_SB_VERSION2_ATTR2BIT	| \
	 XFS_SB_VERSION2_PROJID32BIT	| \
	 XFS_SB_VERSION2_FTYPE)
82
83/*
84 * Superblock - in core version. Must match the ondisk version below.
85 * Must be padded to 64 bit alignment.
86 */
87typedef struct xfs_sb {
88 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
89 __uint32_t sb_blocksize; /* logical block size, bytes */
90 xfs_drfsbno_t sb_dblocks; /* number of data blocks */
91 xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */
92 xfs_drtbno_t sb_rextents; /* number of realtime extents */
93 uuid_t sb_uuid; /* file system unique id */
94 xfs_dfsbno_t sb_logstart; /* starting block of log if internal */
95 xfs_ino_t sb_rootino; /* root inode number */
96 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
97 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
98 xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
99 xfs_agblock_t sb_agblocks; /* size of an allocation group */
100 xfs_agnumber_t sb_agcount; /* number of allocation groups */
101 xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
102 xfs_extlen_t sb_logblocks; /* number of log blocks */
103 __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
104 __uint16_t sb_sectsize; /* volume sector size, bytes */
105 __uint16_t sb_inodesize; /* inode size, bytes */
106 __uint16_t sb_inopblock; /* inodes per block */
107 char sb_fname[12]; /* file system name */
108 __uint8_t sb_blocklog; /* log2 of sb_blocksize */
109 __uint8_t sb_sectlog; /* log2 of sb_sectsize */
110 __uint8_t sb_inodelog; /* log2 of sb_inodesize */
111 __uint8_t sb_inopblog; /* log2 of sb_inopblock */
112 __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
113 __uint8_t sb_rextslog; /* log2 of sb_rextents */
114 __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
115 __uint8_t sb_imax_pct; /* max % of fs for inode space */
116 /* statistics */
117 /*
118 * These fields must remain contiguous. If you really
119 * want to change their layout, make sure you fix the
120 * code in xfs_trans_apply_sb_deltas().
121 */
122 __uint64_t sb_icount; /* allocated inodes */
123 __uint64_t sb_ifree; /* free inodes */
124 __uint64_t sb_fdblocks; /* free data blocks */
125 __uint64_t sb_frextents; /* free realtime extents */
126 /*
127 * End contiguous fields.
128 */
129 xfs_ino_t sb_uquotino; /* user quota inode */
130 xfs_ino_t sb_gquotino; /* group quota inode */
131 __uint16_t sb_qflags; /* quota flags */
132 __uint8_t sb_flags; /* misc. flags */
133 __uint8_t sb_shared_vn; /* shared version number */
134 xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
135 __uint32_t sb_unit; /* stripe or raid unit */
136 __uint32_t sb_width; /* stripe or raid width */
137 __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
138 __uint8_t sb_logsectlog; /* log2 of the log sector size */
139 __uint16_t sb_logsectsize; /* sector size for the log, bytes */
140 __uint32_t sb_logsunit; /* stripe unit size for the log */
141 __uint32_t sb_features2; /* additional feature bits */
142
143 /*
144 * bad features2 field as a result of failing to pad the sb
145 * structure to 64 bits. Some machines will be using this field
146 * for features2 bits. Easiest just to mark it bad and not use
147 * it for anything else.
148 */
149 __uint32_t sb_bad_features2;
150
151 /* version 5 superblock fields start here */
152
153 /* feature masks */
154 __uint32_t sb_features_compat;
155 __uint32_t sb_features_ro_compat;
156 __uint32_t sb_features_incompat;
157 __uint32_t sb_features_log_incompat;
158
159 __uint32_t sb_crc; /* superblock crc */
160 __uint32_t sb_pad;
161
162 xfs_ino_t sb_pquotino; /* project quota inode */
163 xfs_lsn_t sb_lsn; /* last write sequence */
164
165 /* must be padded to 64 bit alignment */
166} xfs_sb_t;
167
168#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
169
170/*
171 * Superblock - on disk version. Must match the in core version above.
172 * Must be padded to 64 bit alignment.
173 */
174typedef struct xfs_dsb {
175 __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
176 __be32 sb_blocksize; /* logical block size, bytes */
177 __be64 sb_dblocks; /* number of data blocks */
178 __be64 sb_rblocks; /* number of realtime blocks */
179 __be64 sb_rextents; /* number of realtime extents */
180 uuid_t sb_uuid; /* file system unique id */
181 __be64 sb_logstart; /* starting block of log if internal */
182 __be64 sb_rootino; /* root inode number */
183 __be64 sb_rbmino; /* bitmap inode for realtime extents */
184 __be64 sb_rsumino; /* summary inode for rt bitmap */
185 __be32 sb_rextsize; /* realtime extent size, blocks */
186 __be32 sb_agblocks; /* size of an allocation group */
187 __be32 sb_agcount; /* number of allocation groups */
188 __be32 sb_rbmblocks; /* number of rt bitmap blocks */
189 __be32 sb_logblocks; /* number of log blocks */
190 __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
191 __be16 sb_sectsize; /* volume sector size, bytes */
192 __be16 sb_inodesize; /* inode size, bytes */
193 __be16 sb_inopblock; /* inodes per block */
194 char sb_fname[12]; /* file system name */
195 __u8 sb_blocklog; /* log2 of sb_blocksize */
196 __u8 sb_sectlog; /* log2 of sb_sectsize */
197 __u8 sb_inodelog; /* log2 of sb_inodesize */
198 __u8 sb_inopblog; /* log2 of sb_inopblock */
199 __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
200 __u8 sb_rextslog; /* log2 of sb_rextents */
201 __u8 sb_inprogress; /* mkfs is in progress, don't mount */
202 __u8 sb_imax_pct; /* max % of fs for inode space */
203 /* statistics */
204 /*
205 * These fields must remain contiguous. If you really
206 * want to change their layout, make sure you fix the
207 * code in xfs_trans_apply_sb_deltas().
208 */
209 __be64 sb_icount; /* allocated inodes */
210 __be64 sb_ifree; /* free inodes */
211 __be64 sb_fdblocks; /* free data blocks */
212 __be64 sb_frextents; /* free realtime extents */
213 /*
214 * End contiguous fields.
215 */
216 __be64 sb_uquotino; /* user quota inode */
217 __be64 sb_gquotino; /* group quota inode */
218 __be16 sb_qflags; /* quota flags */
219 __u8 sb_flags; /* misc. flags */
220 __u8 sb_shared_vn; /* shared version number */
221 __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
222 __be32 sb_unit; /* stripe or raid unit */
223 __be32 sb_width; /* stripe or raid width */
224 __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
225 __u8 sb_logsectlog; /* log2 of the log sector size */
226 __be16 sb_logsectsize; /* sector size for the log, bytes */
227 __be32 sb_logsunit; /* stripe unit size for the log */
228 __be32 sb_features2; /* additional feature bits */
229 /*
230 * bad features2 field as a result of failing to pad the sb
231 * structure to 64 bits. Some machines will be using this field
232 * for features2 bits. Easiest just to mark it bad and not use
233 * it for anything else.
234 */
235 __be32 sb_bad_features2;
236
237 /* version 5 superblock fields start here */
238
239 /* feature masks */
240 __be32 sb_features_compat;
241 __be32 sb_features_ro_compat;
242 __be32 sb_features_incompat;
243 __be32 sb_features_log_incompat;
244
245 __le32 sb_crc; /* superblock crc */
246 __be32 sb_pad;
247
248 __be64 sb_pquotino; /* project quota inode */
249 __be64 sb_lsn; /* last write sequence */
250
251 /* must be padded to 64 bit alignment */
252} xfs_dsb_t;
253
/*
 * Sequence numbers of the superblock fields, in structure order.  The
 * values double as bit positions for the XFS_SB_* field masks, and
 * XFS_SBS_FIELDCOUNT is the number of fields.
 */
typedef enum {
	XFS_SBS_MAGICNUM,
	XFS_SBS_BLOCKSIZE,
	XFS_SBS_DBLOCKS,
	XFS_SBS_RBLOCKS,
	XFS_SBS_REXTENTS,
	XFS_SBS_UUID,
	XFS_SBS_LOGSTART,
	XFS_SBS_ROOTINO,
	XFS_SBS_RBMINO,
	XFS_SBS_RSUMINO,
	XFS_SBS_REXTSIZE,
	XFS_SBS_AGBLOCKS,
	XFS_SBS_AGCOUNT,
	XFS_SBS_RBMBLOCKS,
	XFS_SBS_LOGBLOCKS,
	XFS_SBS_VERSIONNUM,
	XFS_SBS_SECTSIZE,
	XFS_SBS_INODESIZE,
	XFS_SBS_INOPBLOCK,
	XFS_SBS_FNAME,
	XFS_SBS_BLOCKLOG,
	XFS_SBS_SECTLOG,
	XFS_SBS_INODELOG,
	XFS_SBS_INOPBLOG,
	XFS_SBS_AGBLKLOG,
	XFS_SBS_REXTSLOG,
	XFS_SBS_INPROGRESS,
	XFS_SBS_IMAX_PCT,
	XFS_SBS_ICOUNT,
	XFS_SBS_IFREE,
	XFS_SBS_FDBLOCKS,
	XFS_SBS_FREXTENTS,
	XFS_SBS_UQUOTINO,
	XFS_SBS_GQUOTINO,
	XFS_SBS_QFLAGS,
	XFS_SBS_FLAGS,
	XFS_SBS_SHARED_VN,
	XFS_SBS_INOALIGNMT,
	XFS_SBS_UNIT,
	XFS_SBS_WIDTH,
	XFS_SBS_DIRBLKLOG,
	XFS_SBS_LOGSECTLOG,
	XFS_SBS_LOGSECTSIZE,
	XFS_SBS_LOGSUNIT,
	XFS_SBS_FEATURES2,
	XFS_SBS_BAD_FEATURES2,
	XFS_SBS_FEATURES_COMPAT,
	XFS_SBS_FEATURES_RO_COMPAT,
	XFS_SBS_FEATURES_INCOMPAT,
	XFS_SBS_FEATURES_LOG_INCOMPAT,
	XFS_SBS_CRC,
	XFS_SBS_PAD,
	XFS_SBS_PQUOTINO,
	XFS_SBS_LSN,
	XFS_SBS_FIELDCOUNT
} xfs_sb_field_t;
276
/*
 * Field masks.  Each mask is a single bit whose position is the matching
 * xfs_sb_field_t sequence number.  Masks are only defined for the fields
 * that are actually used.
 */
#define XFS_SB_MVAL(x)			(1LL << XFS_SBS_ ## x)

#define XFS_SB_UUID			XFS_SB_MVAL(UUID)
#define XFS_SB_FNAME			XFS_SB_MVAL(FNAME)
#define XFS_SB_ROOTINO			XFS_SB_MVAL(ROOTINO)
#define XFS_SB_RBMINO			XFS_SB_MVAL(RBMINO)
#define XFS_SB_RSUMINO			XFS_SB_MVAL(RSUMINO)
#define XFS_SB_VERSIONNUM		XFS_SB_MVAL(VERSIONNUM)
#define XFS_SB_UQUOTINO			XFS_SB_MVAL(UQUOTINO)
#define XFS_SB_GQUOTINO			XFS_SB_MVAL(GQUOTINO)
#define XFS_SB_QFLAGS			XFS_SB_MVAL(QFLAGS)
#define XFS_SB_SHARED_VN		XFS_SB_MVAL(SHARED_VN)
#define XFS_SB_UNIT			XFS_SB_MVAL(UNIT)
#define XFS_SB_WIDTH			XFS_SB_MVAL(WIDTH)
#define XFS_SB_ICOUNT			XFS_SB_MVAL(ICOUNT)
#define XFS_SB_IFREE			XFS_SB_MVAL(IFREE)
#define XFS_SB_FDBLOCKS			XFS_SB_MVAL(FDBLOCKS)
#define XFS_SB_FEATURES2		XFS_SB_MVAL(FEATURES2)
#define XFS_SB_BAD_FEATURES2		XFS_SB_MVAL(BAD_FEATURES2)
#define XFS_SB_FEATURES_COMPAT		XFS_SB_MVAL(FEATURES_COMPAT)
#define XFS_SB_FEATURES_RO_COMPAT	XFS_SB_MVAL(FEATURES_RO_COMPAT)
#define XFS_SB_FEATURES_INCOMPAT	XFS_SB_MVAL(FEATURES_INCOMPAT)
#define XFS_SB_FEATURES_LOG_INCOMPAT	XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
#define XFS_SB_CRC			XFS_SB_MVAL(CRC)
#define XFS_SB_PQUOTINO			XFS_SB_MVAL(PQUOTINO)

/* number of superblock fields, and a mask with one bit set per field */
#define XFS_SB_NUM_BITS			((int)XFS_SBS_FIELDCOUNT)
#define XFS_SB_ALL_BITS			((1LL << XFS_SB_NUM_BITS) - 1)

/* fields that xfs_mod_sb() accepts at either end of the logged range */
#define XFS_SB_MOD_BITS			\
	(XFS_SB_UUID | \
	 XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
	 XFS_SB_VERSIONNUM | \
	 XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | XFS_SB_QFLAGS | \
	 XFS_SB_SHARED_VN | \
	 XFS_SB_UNIT | XFS_SB_WIDTH | \
	 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | \
	 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | \
	 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
	 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
	 XFS_SB_PQUOTINO)
315
316
/*
 * Miscellaneous sb_flags values.  Warning: xfs_repair clears these unless a
 * feature bit is set when the flag is used.
 */
#define XFS_SBF_NOFLAGS		0x00	/* no flags set */
#define XFS_SBF_READONLY	0x01	/* only read-only mounts allowed */

/* highest shared version number we can interoperate with */
#define XFS_SB_MAX_SHARED_VN	0

/* extract the version number from sb_versionnum, dropping the feature bits */
#define	XFS_SB_VERSION_NUM(sbp)	\
	((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
330
331/*
332 * The first XFS version we support is a v4 superblock with V2 directories.
333 */
334static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
335{
336 if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
337 return false;
338
339 /* check for unknown features in the fs */
340 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
341 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
342 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
343 return false;
344
345 return true;
346}
347
348static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
349{
350 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
351 return true;
352 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
353 return xfs_sb_good_v4_features(sbp);
354 return false;
355}
356
357/*
358 * Detect a mismatched features2 field. Older kernels read/wrote
359 * this into the wrong slot, so to be safe we keep them in sync.
360 */
361static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
362{
363 return sbp->sb_bad_features2 != sbp->sb_features2;
364}
365
366static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
367{
368 return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
369}
370
371static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
372{
373 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
374}
375
376static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
377{
378 return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
379}
380
381static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
382{
383 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
384}
385
386static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
387{
388 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
389 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
390}
391
392static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
393{
394 return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
395}
396
397static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
398{
399 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
400 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
401}
402
403static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
404{
405 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
406 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
407}
408
409static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
410{
411 return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
412}
413
414static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
415{
416 return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
417}
418
419static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
420{
421 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
422 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
423}
424
425/*
426 * sb_features2 bit version macros.
427 */
428static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
429{
430 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
431 (xfs_sb_version_hasmorebits(sbp) &&
432 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
433}
434
435static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
436{
437 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
438 (xfs_sb_version_hasmorebits(sbp) &&
439 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
440}
441
442static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
443{
444 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
445 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
446 sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
447}
448
449static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
450{
451 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
452 sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
453 if (!sbp->sb_features2)
454 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
455}
456
457static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
458{
459 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
460 (xfs_sb_version_hasmorebits(sbp) &&
461 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
462}
463
464static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
465{
466 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
467 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
468 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
469}
470
471/*
472 * Extended v5 superblock feature masks. These are to be used for new v5
473 * superblock features only.
474 *
475 * Compat features are new features that old kernels will not notice or affect
476 * and so can mount read-write without issues.
477 *
478 * RO-Compat (read only) are features that old kernels can read but will break
479 * if they write. Hence only read-only mounts of such filesystems are allowed on
480 * kernels that don't support the feature bit.
481 *
482 * InCompat features are features which old kernels will not understand and so
483 * must not mount.
484 *
485 * Log-InCompat features are for changes to log formats or new transactions that
486 * can't be replayed on older kernels. The fields are set when the filesystem is
487 * mounted, and a clean unmount clears the fields.
488 */
/*
 * Compat feature bits this code knows about (none), and the complement:
 * every bit that would be an unknown compat feature.  The complement is
 * parenthesised so it expands as a single operand wherever it is used.
 */
#define XFS_SB_FEAT_COMPAT_ALL		0
#define XFS_SB_FEAT_COMPAT_UNKNOWN	(~XFS_SB_FEAT_COMPAT_ALL)
491static inline bool
492xfs_sb_has_compat_feature(
493 struct xfs_sb *sbp,
494 __uint32_t feature)
495{
496 return (sbp->sb_features_compat & feature) != 0;
497}
498
/*
 * Read-only compat feature bits this code knows about, and the complement:
 * every bit that would be an unknown ro-compat feature.  The complement is
 * parenthesised so it expands as a single operand wherever it is used.
 */
#define XFS_SB_FEAT_RO_COMPAT_FINOBT	(1 << 0)	/* free inode btree */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
		(XFS_SB_FEAT_RO_COMPAT_FINOBT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	(~XFS_SB_FEAT_RO_COMPAT_ALL)
503static inline bool
504xfs_sb_has_ro_compat_feature(
505 struct xfs_sb *sbp,
506 __uint32_t feature)
507{
508 return (sbp->sb_features_ro_compat & feature) != 0;
509}
510
/*
 * Incompat feature bits this code knows about, and the complement: every
 * bit that would be an unknown incompat feature.  The complement is
 * parenthesised so it expands as a single operand wherever it is used.
 */
#define XFS_SB_FEAT_INCOMPAT_FTYPE	(1 << 0)	/* filetype in dirent */
#define XFS_SB_FEAT_INCOMPAT_ALL \
		(XFS_SB_FEAT_INCOMPAT_FTYPE)

#define XFS_SB_FEAT_INCOMPAT_UNKNOWN	(~XFS_SB_FEAT_INCOMPAT_ALL)
516static inline bool
517xfs_sb_has_incompat_feature(
518 struct xfs_sb *sbp,
519 __uint32_t feature)
520{
521 return (sbp->sb_features_incompat & feature) != 0;
522}
523
/*
 * Log incompat feature bits this code knows about (none), and the
 * complement: every bit that would be an unknown log incompat feature.  The
 * complement is parenthesised so it expands as a single operand wherever it
 * is used.
 */
#define XFS_SB_FEAT_INCOMPAT_LOG_ALL		0
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN	(~XFS_SB_FEAT_INCOMPAT_LOG_ALL)
526static inline bool
527xfs_sb_has_incompat_log_feature(
528 struct xfs_sb *sbp,
529 __uint32_t feature)
530{
531 return (sbp->sb_features_log_incompat & feature) != 0;
532}
533
534/*
535 * V5 superblock specific feature checks
536 */
537static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
538{
539 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
540}
541
542static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
543{
544 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
545}
546
547static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
548{
549 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
550 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
551 (xfs_sb_version_hasmorebits(sbp) &&
552 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
553}
554
555static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
556{
557 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
558 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
559}
560
561/*
562 * end of superblock version macros
563 */
564
565static inline bool
566xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
567{
568 return (ino == sbp->sb_uquotino ||
569 ino == sbp->sb_gquotino ||
570 ino == sbp->sb_pquotino);
571}
572
#define	XFS_SB_DADDR		((xfs_daddr_t)0) /* daddr in filesystem/ag */
#define	XFS_SB_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
#define	XFS_BUF_TO_SBP(bp)	((xfs_dsb_t *)((bp)->b_addr))

#define	XFS_HDR_BLOCK(mp, daddr) \
	((xfs_agblock_t)XFS_BB_TO_FSBT(mp, daddr))
#define	XFS_DADDR_TO_FSB(mp, daddr) \
	XFS_AGB_TO_FSB(mp, \
		xfs_daddr_to_agno(mp, daddr), xfs_daddr_to_agbno(mp, daddr))
#define	XFS_FSB_TO_DADDR(mp, fsbno) \
	XFS_AGB_TO_DADDR(mp, \
		XFS_FSB_TO_AGNO(mp, fsbno), XFS_FSB_TO_AGBNO(mp, fsbno))

/*
 * Sectors -> basic blocks.
 */
#define	XFS_FSS_TO_BB(mp, nsect)	((nsect) << (mp)->m_sectbb_log)

/*
 * Filesystem blocks <-> basic blocks.  XFS_BB_TO_FSB rounds up to a whole
 * filesystem block, XFS_BB_TO_FSBT truncates.
 */
#define	XFS_FSB_TO_BB(mp, fsbno)	((fsbno) << (mp)->m_blkbb_log)
#define	XFS_BB_TO_FSB(mp, nbb) \
	(((nbb) + (XFS_FSB_TO_BB(mp, 1) - 1)) >> (mp)->m_blkbb_log)
#define	XFS_BB_TO_FSBT(mp, nbb)		((nbb) >> (mp)->m_blkbb_log)

/*
 * Filesystem blocks <-> bytes.  XFS_B_TO_FSB rounds up to a whole
 * filesystem block, XFS_B_TO_FSBT truncates, and XFS_B_FSB_OFFSET yields
 * the byte offset within the block.
 */
#define	XFS_FSB_TO_B(mp, fsbno) \
	((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
#define	XFS_B_TO_FSB(mp, nbytes) \
	((((__uint64_t)(nbytes)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
#define	XFS_B_TO_FSBT(mp, nbytes) \
	(((__uint64_t)(nbytes)) >> (mp)->m_sb.sb_blocklog)
#define	XFS_B_FSB_OFFSET(mp, nbytes)	((nbytes) & (mp)->m_blockmask)
604
605/*
606 * perag get/put wrappers for ref counting
607 */
608extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
609extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
610 int tag);
611extern void xfs_perag_put(struct xfs_perag *pag);
612extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
613
614extern void xfs_sb_calc_crc(struct xfs_buf *);
615extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
616extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
617extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
618extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
619extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
620
621#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
new file mode 100644
index 000000000000..82404da2ca67
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -0,0 +1,246 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#ifndef __XFS_SHARED_H__
20#define __XFS_SHARED_H__
21
/*
 * Definitions shared between kernel and userspace that don't fit into any other
 * header file that is shared with userspace.
 */
struct xfs_ifork;
struct xfs_buf;
struct xfs_buf_ops;
struct xfs_mount;
struct xfs_trans;
struct xfs_inode;

/*
 * Buffer verifier operations are widely used, including userspace tools.
 *
 * Each ops table is declared exactly once, in alphabetical order, so that a
 * missing or repeated entry is easy to spot.
 */
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
extern const struct xfs_buf_ops xfs_inobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
55
/*
 * Transaction types. Used to distinguish types of buffers. These never reach
 * the log.
 *
 * Values are stable identifiers: retired numbers are left as gaps (see 17)
 * rather than being reused.
 */
#define XFS_TRANS_SETATTR_NOT_SIZE	1
#define XFS_TRANS_SETATTR_SIZE		2
#define XFS_TRANS_INACTIVE		3
#define XFS_TRANS_CREATE		4
#define XFS_TRANS_CREATE_TRUNC		5
#define XFS_TRANS_TRUNCATE_FILE		6
#define XFS_TRANS_REMOVE		7
#define XFS_TRANS_LINK			8
#define XFS_TRANS_RENAME		9
#define XFS_TRANS_MKDIR			10
#define XFS_TRANS_RMDIR			11
#define XFS_TRANS_SYMLINK		12
#define XFS_TRANS_SET_DMATTRS		13
#define XFS_TRANS_GROWFS		14
#define XFS_TRANS_STRAT_WRITE		15
#define XFS_TRANS_DIOSTRAT		16
/* 17 was XFS_TRANS_WRITE_SYNC */
#define XFS_TRANS_WRITEID		18
#define XFS_TRANS_ADDAFORK		19
#define XFS_TRANS_ATTRINVAL		20
#define XFS_TRANS_ATRUNCATE		21
#define XFS_TRANS_ATTR_SET		22
#define XFS_TRANS_ATTR_RM		23
#define XFS_TRANS_ATTR_FLAG		24
#define XFS_TRANS_CLEAR_AGI_BUCKET	25
#define XFS_TRANS_QM_SBCHANGE		26
/*
 * Dummy entries since we use the transaction type to index into the
 * trans_type[] in xlog_recover_print_trans_head()
 */
#define XFS_TRANS_DUMMY1		27
#define XFS_TRANS_DUMMY2		28
#define XFS_TRANS_QM_QUOTAOFF		29
#define XFS_TRANS_QM_DQALLOC		30
#define XFS_TRANS_QM_SETQLIM		31
#define XFS_TRANS_QM_DQCLUSTER		32
#define XFS_TRANS_QM_QINOCREATE		33
#define XFS_TRANS_QM_QUOTAOFF_END	34
#define XFS_TRANS_SB_UNIT		35
#define XFS_TRANS_FSYNC_TS		36
#define XFS_TRANS_GROWFSRT_ALLOC	37
#define XFS_TRANS_GROWFSRT_ZERO		38
#define XFS_TRANS_GROWFSRT_FREE		39
#define XFS_TRANS_SWAPEXT		40
#define XFS_TRANS_SB_COUNT		41
#define XFS_TRANS_CHECKPOINT		42
#define XFS_TRANS_ICREATE		43
#define XFS_TRANS_CREATE_TMPFILE	44
#define XFS_TRANS_TYPE_MAX		44
/* new transaction types need to be reflected in xfs_logprint(8) */

/*
 * { value, "name" } initializer pairs for the transaction types above, plus
 * the unmount record type. Every defined XFS_TRANS_* type, including
 * XFS_TRANS_ICREATE, has an entry so that no type is left without a name.
 * XLOG_UNMOUNT_REC_TYPE is defined outside this header.
 */
#define XFS_TRANS_TYPES \
	{ XFS_TRANS_SETATTR_NOT_SIZE,	"SETATTR_NOT_SIZE" }, \
	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
	{ XFS_TRANS_CREATE,		"CREATE" }, \
	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
	{ XFS_TRANS_LINK,		"LINK" }, \
	{ XFS_TRANS_RENAME,		"RENAME" }, \
	{ XFS_TRANS_MKDIR,		"MKDIR" }, \
	{ XFS_TRANS_RMDIR,		"RMDIR" }, \
	{ XFS_TRANS_SYMLINK,		"SYMLINK" }, \
	{ XFS_TRANS_SET_DMATTRS,	"SET_DMATTRS" }, \
	{ XFS_TRANS_GROWFS,		"GROWFS" }, \
	{ XFS_TRANS_STRAT_WRITE,	"STRAT_WRITE" }, \
	{ XFS_TRANS_DIOSTRAT,		"DIOSTRAT" }, \
	{ XFS_TRANS_WRITEID,		"WRITEID" }, \
	{ XFS_TRANS_ADDAFORK,		"ADDAFORK" }, \
	{ XFS_TRANS_ATTRINVAL,		"ATTRINVAL" }, \
	{ XFS_TRANS_ATRUNCATE,		"ATRUNCATE" }, \
	{ XFS_TRANS_ATTR_SET,		"ATTR_SET" }, \
	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
	{ XFS_TRANS_ICREATE,		"ICREATE" }, \
	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
155
156/*
157 * This structure is used to track log items associated with
158 * a transaction. It points to the log item and keeps some
159 * flags to track the state of the log item. It also tracks
160 * the amount of space needed to log the item it describes
161 * once we get to commit processing (see xfs_trans_commit()).
162 */
/*
 * NOTE(review): no field in the structure as declared here holds a size, so
 * the "amount of space" sentence above may be stale - confirm.
 */
163struct xfs_log_item_desc {
164 struct xfs_log_item *lid_item; /* the log item being described */
165 struct list_head lid_trans; /* list linkage; presumably the owning transaction's item list */
166 unsigned char lid_flags; /* XFS_LID_* state flags */
167};
168
/* lid_flags value; presumably marks the item as modified in the transaction. */
169#define XFS_LID_DIRTY 0x1
170
171/* log size calculation functions */
/*
 * Both return int. The definitions are not part of this header;
 * NOTE(review): units of the return values are not visible here - confirm.
 */
172int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
173int xfs_log_calc_minimum_size(struct xfs_mount *);
174
175
176/*
177 * Values for t_flags.
178 */
/* Each value is a distinct single bit. */
179#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
180#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
181#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
182#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
183#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
184#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
185#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
186 count in superblock */
187/*
188 * Values for call flags parameter.
189 */
/*
 * NOTE(review): these reuse the numeric values 0x4 and 0x8 that
 * XFS_TRANS_PERM_LOG_RES and XFS_TRANS_SYNC have in t_flags. They are a
 * separate flag namespace; confirm callers never mix the two sets.
 */
190#define XFS_TRANS_RELEASE_LOG_RES 0x4
191#define XFS_TRANS_ABORT 0x8
192
193/*
194 * Field values for xfs_trans_mod_sb.
195 */
/*
 * One distinct bit per value. Going by the names, each selects a superblock
 * field or counter; the RES_ variants presumably refer to reserved space -
 * TODO confirm against xfs_trans_mod_sb().
 */
196#define XFS_TRANS_SB_ICOUNT 0x00000001
197#define XFS_TRANS_SB_IFREE 0x00000002
198#define XFS_TRANS_SB_FDBLOCKS 0x00000004
199#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
200#define XFS_TRANS_SB_FREXTENTS 0x00000010
201#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
202#define XFS_TRANS_SB_DBLOCKS 0x00000040
203#define XFS_TRANS_SB_AGCOUNT 0x00000080
204#define XFS_TRANS_SB_IMAXPCT 0x00000100
205#define XFS_TRANS_SB_REXTSIZE 0x00000200
206#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
207#define XFS_TRANS_SB_RBLOCKS 0x00000800
208#define XFS_TRANS_SB_REXTENTS 0x00001000
209#define XFS_TRANS_SB_REXTSLOG 0x00002000
210
211/*
212 * Here we centralize the specification of XFS meta-data buffer reference count
213 * values. This determines how hard the buffer cache tries to hold onto the
214 * buffer.
215 */
/*
 * The values are listed from highest to lowest: AGF/AGI headers get 4, the
 * AGFL and inode btree 3, the other btrees and inode buffers 2, and attribute
 * btree and dquot buffers 1.
 */
216#define XFS_AGF_REF 4
217#define XFS_AGI_REF 4
218#define XFS_AGFL_REF 3
219#define XFS_INO_BTREE_REF 3
220#define XFS_ALLOC_BTREE_REF 2
221#define XFS_BMAP_BTREE_REF 2
222#define XFS_DIR_BTREE_REF 2
223#define XFS_INO_REF 2
224#define XFS_ATTR_BTREE_REF 1
225#define XFS_DQUOT_REF 1
226
227/*
228 * Flags for xfs_trans_ichgtime().
229 */
/* Distinct bits; presumably they may be OR-ed together - confirm in caller. */
230#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
231#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
232#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
233
234
235/*
236 * Symlink decoding/encoding functions
237 */
/* Number of blocks needed to hold a symlink target of pathlen bytes. */
238int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
/* Write the symlink block header into bp; returns the header size, or 0 without CRCs. */
239int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
240 uint32_t size, struct xfs_buf *bp);
/* True when the header in bp matches the expected owner, offset and size. */
241bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
242 uint32_t size, struct xfs_buf *bp);
/* Copy an inode-resident symlink target from ifp into the buffer bp. */
243void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
244 struct xfs_inode *ip, struct xfs_ifork *ifp);
245
246#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
new file mode 100644
index 000000000000..5782f037eab4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -0,0 +1,201 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2012-2013 Red Hat, Inc.
4 * All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log_format.h"
23#include "xfs_shared.h"
24#include "xfs_trans_resv.h"
25#include "xfs_ag.h"
26#include "xfs_sb.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_inode.h"
30#include "xfs_error.h"
31#include "xfs_trace.h"
32#include "xfs_symlink.h"
33#include "xfs_cksum.h"
34#include "xfs_trans.h"
35#include "xfs_buf_item.h"
36
37
38/*
39 * Each contiguous block has a header, so it is not just a simple pathlen
40 * to FSB conversion.
41 */
42int
43xfs_symlink_blocks(
44 struct xfs_mount *mp,
45 int pathlen)
46{
47 int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
48
49 return (pathlen + buflen - 1) / buflen;
50}
51
52int
53xfs_symlink_hdr_set(
54 struct xfs_mount *mp,
55 xfs_ino_t ino,
56 uint32_t offset,
57 uint32_t size,
58 struct xfs_buf *bp)
59{
60 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
61
62 if (!xfs_sb_version_hascrc(&mp->m_sb))
63 return 0;
64
65 dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
66 dsl->sl_offset = cpu_to_be32(offset);
67 dsl->sl_bytes = cpu_to_be32(size);
68 uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
69 dsl->sl_owner = cpu_to_be64(ino);
70 dsl->sl_blkno = cpu_to_be64(bp->b_bn);
71 bp->b_ops = &xfs_symlink_buf_ops;
72
73 return sizeof(struct xfs_dsymlink_hdr);
74}
75
76/*
77 * Checking of the symlink header is split into two parts. the verifier does
78 * CRC, location and bounds checking, the unpacking function checks the path
79 * parameters and owner.
80 */
81bool
82xfs_symlink_hdr_ok(
83 xfs_ino_t ino,
84 uint32_t offset,
85 uint32_t size,
86 struct xfs_buf *bp)
87{
88 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
89
90 if (offset != be32_to_cpu(dsl->sl_offset))
91 return false;
92 if (size != be32_to_cpu(dsl->sl_bytes))
93 return false;
94 if (ino != be64_to_cpu(dsl->sl_owner))
95 return false;
96
97 /* ok */
98 return true;
99}
100
101static bool
102xfs_symlink_verify(
103 struct xfs_buf *bp)
104{
105 struct xfs_mount *mp = bp->b_target->bt_mount;
106 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
107
108 if (!xfs_sb_version_hascrc(&mp->m_sb))
109 return false;
110 if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
111 return false;
112 if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
113 return false;
114 if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
115 return false;
116 if (be32_to_cpu(dsl->sl_offset) +
117 be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
118 return false;
119 if (dsl->sl_owner == 0)
120 return false;
121
122 return true;
123}
124
125static void
126xfs_symlink_read_verify(
127 struct xfs_buf *bp)
128{
129 struct xfs_mount *mp = bp->b_target->bt_mount;
130
131 /* no verification of non-crc buffers */
132 if (!xfs_sb_version_hascrc(&mp->m_sb))
133 return;
134
135 if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
136 xfs_buf_ioerror(bp, -EFSBADCRC);
137 else if (!xfs_symlink_verify(bp))
138 xfs_buf_ioerror(bp, -EFSCORRUPTED);
139
140 if (bp->b_error)
141 xfs_verifier_error(bp);
142}
143
144static void
145xfs_symlink_write_verify(
146 struct xfs_buf *bp)
147{
148 struct xfs_mount *mp = bp->b_target->bt_mount;
149 struct xfs_buf_log_item *bip = bp->b_fspriv;
150
151 /* no verification of non-crc buffers */
152 if (!xfs_sb_version_hascrc(&mp->m_sb))
153 return;
154
155 if (!xfs_symlink_verify(bp)) {
156 xfs_buf_ioerror(bp, -EFSCORRUPTED);
157 xfs_verifier_error(bp);
158 return;
159 }
160
161 if (bip) {
162 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
163 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
164 }
165 xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
166}
167
168const struct xfs_buf_ops xfs_symlink_buf_ops = {
169 .verify_read = xfs_symlink_read_verify,
170 .verify_write = xfs_symlink_write_verify,
171};
172
173void
174xfs_symlink_local_to_remote(
175 struct xfs_trans *tp,
176 struct xfs_buf *bp,
177 struct xfs_inode *ip,
178 struct xfs_ifork *ifp)
179{
180 struct xfs_mount *mp = ip->i_mount;
181 char *buf;
182
183 if (!xfs_sb_version_hascrc(&mp->m_sb)) {
184 bp->b_ops = NULL;
185 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
186 return;
187 }
188
189 /*
190 * As this symlink fits in an inode literal area, it must also fit in
191 * the smallest buffer the filesystem supports.
192 */
193 ASSERT(BBTOB(bp->b_length) >=
194 ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
195
196 bp->b_ops = &xfs_symlink_buf_ops;
197
198 buf = bp->b_addr;
199 buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
200 memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
201}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
new file mode 100644
index 000000000000..f2bda7c76b8a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -0,0 +1,894 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (C) 2010 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_shared.h"
22#include "xfs_format.h"
23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_format.h"
29#include "xfs_da_btree.h"
30#include "xfs_inode.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_ialloc.h"
33#include "xfs_quota.h"
34#include "xfs_trans.h"
35#include "xfs_qm.h"
36#include "xfs_trans_space.h"
37#include "xfs_trace.h"
38
39/*
40 * A buffer has a format structure overhead in the log in addition
41 * to the data, so we need to take this into account when reserving
42 * space in a transaction for a buffer. Round the space required up
43 * to a multiple of 128 bytes so that we don't change the historical
44 * reservation that has been used for this overhead.
45 */
46STATIC uint
47xfs_buf_log_overhead(void)
48{
49 return round_up(sizeof(struct xlog_op_header) +
50 sizeof(struct xfs_buf_log_format), 128);
51}
52
53/*
54 * Calculate out transaction log reservation per item in bytes.
55 *
56 * The nbufs argument is used to indicate the number of items that
57 * will be changed in a transaction. size is used to tell how many
58 * bytes should be reserved per item.
59 */
60STATIC uint
61xfs_calc_buf_res(
62 uint nbufs,
63 uint size)
64{
65 return nbufs * (size + xfs_buf_log_overhead());
66}
67
68/*
69 * Logging inodes is really tricksy. They are logged in memory format,
70 * which means that what we write into the log doesn't directly translate into
71 * the amount of space they use on disk.
72 *
73 * Case in point - btree format forks in memory format use more space than the
74 * on-disk format. In memory, the buffer contains a normal btree block header so
75 * the btree code can treat it as though it is just another generic buffer.
76 * However, when we write it to the inode fork, we don't write all of this
77 * header as it isn't needed. e.g. the root is only ever in the inode, so
78 * there's no need for sibling pointers which would waste 16 bytes of space.
79 *
80 * Hence when we have an inode with a maximally sized btree format fork, then
81 * amount of information we actually log is greater than the size of the inode
82 * on disk. Hence we need an inode reservation function that calculates all this
83 * correctly. So, we log:
84 *
85 * - 4 log op headers for object
86 * - for the ilf, the inode core and 2 forks
87 * - inode log format object
88 * - the inode core
89 * - two inode forks containing bmap btree root blocks.
90 * - the btree data contained by both forks will fit into the inode size,
91 * hence when combined with the inode core above, we have a total of the
92 * actual inode size.
93 * - the BMBT headers need to be accounted separately, as they are
94 * additional to the records and pointers that fit inside the inode
95 * forks.
96 */
97STATIC uint
98xfs_calc_inode_res(
99 struct xfs_mount *mp,
100 uint ninodes)
101{
102 return ninodes *
103 (4 * sizeof(struct xlog_op_header) +
104 sizeof(struct xfs_inode_log_format) +
105 mp->m_sb.sb_inodesize +
106 2 * XFS_BMBT_BLOCK_LEN(mp));
107}
108
109/*
110 * The free inode btree is a conditional feature and the log reservation
111 * requirements differ slightly from that of the traditional inode allocation
112 * btree. The finobt tracks records for inode chunks with at least one free
113 * inode. A record can be removed from the tree for an inode allocation
114 * or free and thus the finobt reservation is unconditional across:
115 *
116 * - inode allocation
117 * - inode free
118 * - inode chunk allocation
119 *
120 * The 'modify' param indicates to include the record modification scenario. The
121 * 'alloc' param indicates to include the reservation for free space btree
122 * modifications on behalf of finobt modifications. This is required only for
123 * transactions that do not already account for free space btree modifications.
124 *
125 * the free inode btree: max depth * block size
126 * the allocation btrees: 2 trees * (max depth - 1) * block size
127 * the free inode btree entry: block size
128 */
129STATIC uint
130xfs_calc_finobt_res(
131 struct xfs_mount *mp,
132 int alloc,
133 int modify)
134{
135 uint res;
136
137 if (!xfs_sb_version_hasfinobt(&mp->m_sb))
138 return 0;
139
140 res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
141 if (alloc)
142 res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
143 XFS_FSB_TO_B(mp, 1));
144 if (modify)
145 res += (uint)XFS_FSB_TO_B(mp, 1);
146
147 return res;
148}
149
150/*
151 * Various log reservation values.
152 *
153 * These are based on the size of the file system block because that is what
154 * most transactions manipulate. Each adds in an additional 128 bytes per
155 * item logged to try to account for the overhead of the transaction mechanism.
156 *
157 * Note: Most of the reservations underestimate the number of allocation
158 * groups into which they could free extents in the xfs_bmap_finish() call.
159 * This is because the number in the worst case is quite high and quite
160 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
161 * extents in only a single AG at a time. This will require changes to the
162 * EFI code as well, however, so that the EFI for the extents not freed is
163 * logged again in each transaction. See SGI PV #261917.
164 *
165 * Reservation functions here avoid a huge stack in xfs_trans_init due to
166 * register overflow from temporaries in the calculations.
167 */
168
169
170/*
171 * In a write transaction we can allocate a maximum of 2
172 * extents. This gives:
173 * the inode getting the new extents: inode size
174 * the inode's bmap btree: max depth * block size
175 * the agfs of the ags from which the extents are allocated: 2 * sector
176 * the superblock free block counter: sector size
177 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
178 * And the bmap_finish transaction can free bmap blocks in a join:
179 * the agfs of the ags containing the blocks: 2 * sector size
180 * the agfls of the ags containing the blocks: 2 * sector size
181 * the super block free block counter: sector size
182 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
183 */
184STATIC uint
185xfs_calc_write_reservation(
186 struct xfs_mount *mp)
187{
188 return XFS_DQUOT_LOGRES(mp) +
189 MAX((xfs_calc_inode_res(mp, 1) +
190 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
191 XFS_FSB_TO_B(mp, 1)) +
192 xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
193 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
194 XFS_FSB_TO_B(mp, 1))),
195 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
196 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
197 XFS_FSB_TO_B(mp, 1))));
198}
199
200/*
201 * In truncating a file we free up to two extents at once. We can modify:
202 * the inode being truncated: inode size
203 * the inode's bmap btree: (max depth + 1) * block size
204 * And the bmap_finish transaction can free the blocks and bmap blocks:
205 * the agf for each of the ags: 4 * sector size
206 * the agfl for each of the ags: 4 * sector size
207 * the super block to reflect the freed blocks: sector size
208 * worst case split in allocation btrees per extent assuming 4 extents:
209 * 4 exts * 2 trees * (2 * max depth - 1) * block size
210 * the inode btree: max depth * blocksize
211 * the allocation btrees: 2 trees * (max depth - 1) * block size
212 */
213STATIC uint
214xfs_calc_itruncate_reservation(
215 struct xfs_mount *mp)
216{
217 return XFS_DQUOT_LOGRES(mp) +
218 MAX((xfs_calc_inode_res(mp, 1) +
219 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
220 XFS_FSB_TO_B(mp, 1))),
221 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
222 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
223 XFS_FSB_TO_B(mp, 1)) +
224 xfs_calc_buf_res(5, 0) +
225 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
226 XFS_FSB_TO_B(mp, 1)) +
227 xfs_calc_buf_res(2 + mp->m_ialloc_blks +
228 mp->m_in_maxlevels, 0)));
229}
230
231/*
232 * In renaming a files we can modify:
233 * the four inodes involved: 4 * inode size
234 * the two directory btrees: 2 * (max depth + v2) * dir block size
235 * the two directory bmap btrees: 2 * max depth * block size
236 * And the bmap_finish transaction can free dir and bmap blocks (two sets
237 * of bmap blocks) giving:
238 * the agf for the ags in which the blocks live: 3 * sector size
239 * the agfl for the ags in which the blocks live: 3 * sector size
240 * the superblock for the free block count: sector size
241 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
242 */
243STATIC uint
244xfs_calc_rename_reservation(
245 struct xfs_mount *mp)
246{
247 return XFS_DQUOT_LOGRES(mp) +
248 MAX((xfs_calc_inode_res(mp, 4) +
249 xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
250 XFS_FSB_TO_B(mp, 1))),
251 (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
252 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
253 XFS_FSB_TO_B(mp, 1))));
254}
255
256/*
257 * For removing an inode from unlinked list at first, we can modify:
258 * the agi hash list and counters: sector size
259 * the on disk inode before ours in the agi hash list: inode cluster size
260 */
261STATIC uint
262xfs_calc_iunlink_remove_reservation(
263 struct xfs_mount *mp)
264{
265 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
266 max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
267}
268
269/*
270 * For creating a link to an inode:
271 * the parent directory inode: inode size
272 * the linked inode: inode size
273 * the directory btree could split: (max depth + v2) * dir block size
274 * the directory bmap btree could join or split: (max depth + v2) * blocksize
275 * And the bmap_finish transaction can free some bmap blocks giving:
276 * the agf for the ag in which the blocks live: sector size
277 * the agfl for the ag in which the blocks live: sector size
278 * the superblock for the free block count: sector size
279 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
280 */
281STATIC uint
282xfs_calc_link_reservation(
283 struct xfs_mount *mp)
284{
285 return XFS_DQUOT_LOGRES(mp) +
286 xfs_calc_iunlink_remove_reservation(mp) +
287 MAX((xfs_calc_inode_res(mp, 2) +
288 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
289 XFS_FSB_TO_B(mp, 1))),
290 (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
291 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
292 XFS_FSB_TO_B(mp, 1))));
293}
294
295/*
296 * For adding an inode to unlinked list we can modify:
297 * the agi hash list: sector size
298 * the unlinked inode: inode size
299 */
300STATIC uint
301xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
302{
303 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
304 xfs_calc_inode_res(mp, 1);
305}
306
307/*
308 * For removing a directory entry we can modify:
309 * the parent directory inode: inode size
310 * the removed inode: inode size
311 * the directory btree could join: (max depth + v2) * dir block size
312 * the directory bmap btree could join or split: (max depth + v2) * blocksize
313 * And the bmap_finish transaction can free the dir and bmap blocks giving:
314 * the agf for the ag in which the blocks live: 2 * sector size
315 * the agfl for the ag in which the blocks live: 2 * sector size
316 * the superblock for the free block count: sector size
317 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
318 */
319STATIC uint
320xfs_calc_remove_reservation(
321 struct xfs_mount *mp)
322{
323 return XFS_DQUOT_LOGRES(mp) +
324 xfs_calc_iunlink_add_reservation(mp) +
325 MAX((xfs_calc_inode_res(mp, 1) +
326 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
327 XFS_FSB_TO_B(mp, 1))),
328 (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
329 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
330 XFS_FSB_TO_B(mp, 1))));
331}
332
333/*
334 * For create, break it into the two cases that the transaction
335 * covers. We start with the modify case - allocation done by modification
336 * of the state of existing inodes - and the allocation case.
337 */
338
339/*
340 * For create we can modify:
341 * the parent directory inode: inode size
342 * the new inode: inode size
343 * the inode btree entry: block size
344 * the superblock for the nlink flag: sector size
345 * the directory btree: (max depth + v2) * dir block size
346 * the directory inode's bmap btree: (max depth + v2) * block size
347 * the finobt (record modification and allocation btrees)
348 */
349STATIC uint
350xfs_calc_create_resv_modify(
351 struct xfs_mount *mp)
352{
353 return xfs_calc_inode_res(mp, 2) +
354 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
355 (uint)XFS_FSB_TO_B(mp, 1) +
356 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
357 xfs_calc_finobt_res(mp, 1, 1);
358}
359
360/*
361 * For create we can allocate some inodes giving:
362 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
363 * the superblock for the nlink flag: sector size
364 * the inode blocks allocated: mp->m_ialloc_blks * blocksize
365 * the inode btree: max depth * blocksize
366 * the allocation btrees: 2 trees * (max depth - 1) * block size
367 */
368STATIC uint
369xfs_calc_create_resv_alloc(
370 struct xfs_mount *mp)
371{
372 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
373 mp->m_sb.sb_sectsize +
374 xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
375 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
376 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
377 XFS_FSB_TO_B(mp, 1));
378}
379
380STATIC uint
381__xfs_calc_create_reservation(
382 struct xfs_mount *mp)
383{
384 return XFS_DQUOT_LOGRES(mp) +
385 MAX(xfs_calc_create_resv_alloc(mp),
386 xfs_calc_create_resv_modify(mp));
387}
388
389/*
390 * For icreate we can allocate some inodes giving:
391 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
392 * the superblock for the nlink flag: sector size
393 * the inode btree: max depth * blocksize
394 * the allocation btrees: 2 trees * (max depth - 1) * block size
395 * the finobt (record insertion)
396 */
397STATIC uint
398xfs_calc_icreate_resv_alloc(
399 struct xfs_mount *mp)
400{
401 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
402 mp->m_sb.sb_sectsize +
403 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
404 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
405 XFS_FSB_TO_B(mp, 1)) +
406 xfs_calc_finobt_res(mp, 0, 0);
407}
408
409STATIC uint
410xfs_calc_icreate_reservation(xfs_mount_t *mp)
411{
412 return XFS_DQUOT_LOGRES(mp) +
413 MAX(xfs_calc_icreate_resv_alloc(mp),
414 xfs_calc_create_resv_modify(mp));
415}
416
417STATIC uint
418xfs_calc_create_reservation(
419 struct xfs_mount *mp)
420{
421 if (xfs_sb_version_hascrc(&mp->m_sb))
422 return xfs_calc_icreate_reservation(mp);
423 return __xfs_calc_create_reservation(mp);
424
425}
426
427STATIC uint
428xfs_calc_create_tmpfile_reservation(
429 struct xfs_mount *mp)
430{
431 uint res = XFS_DQUOT_LOGRES(mp);
432
433 if (xfs_sb_version_hascrc(&mp->m_sb))
434 res += xfs_calc_icreate_resv_alloc(mp);
435 else
436 res += xfs_calc_create_resv_alloc(mp);
437
438 return res + xfs_calc_iunlink_add_reservation(mp);
439}
440
441/*
442 * Making a new directory is the same as creating a new file.
443 */
444STATIC uint
445xfs_calc_mkdir_reservation(
446 struct xfs_mount *mp)
447{
448 return xfs_calc_create_reservation(mp);
449}
450
451
452/*
453 * Making a new symplink is the same as creating a new file, but
454 * with the added blocks for remote symlink data which can be up to 1kB in
455 * length (MAXPATHLEN).
456 */
457STATIC uint
458xfs_calc_symlink_reservation(
459 struct xfs_mount *mp)
460{
461 return xfs_calc_create_reservation(mp) +
462 xfs_calc_buf_res(1, MAXPATHLEN);
463}
464
465/*
466 * In freeing an inode we can modify:
467 * the inode being freed: inode size
468 * the super block free inode counter: sector size
469 * the agi hash list and counters: sector size
470 * the inode btree entry: block size
471 * the on disk inode before ours in the agi hash list: inode cluster size
472 * the inode btree: max depth * blocksize
473 * the allocation btrees: 2 trees * (max depth - 1) * block size
474 * the finobt (record insertion, removal or modification)
475 */
476STATIC uint
477xfs_calc_ifree_reservation(
478 struct xfs_mount *mp)
479{
480 return XFS_DQUOT_LOGRES(mp) +
481 xfs_calc_inode_res(mp, 1) +
482 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
483 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
484 xfs_calc_iunlink_remove_reservation(mp) +
485 xfs_calc_buf_res(1, 0) +
486 xfs_calc_buf_res(2 + mp->m_ialloc_blks +
487 mp->m_in_maxlevels, 0) +
488 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
489 XFS_FSB_TO_B(mp, 1)) +
490 xfs_calc_finobt_res(mp, 0, 1);
491}
492
493/*
494 * When only changing the inode we log the inode and possibly the superblock
495 * We also add a bit of slop for the transaction stuff.
496 */
497STATIC uint
498xfs_calc_ichange_reservation(
499 struct xfs_mount *mp)
500{
501 return XFS_DQUOT_LOGRES(mp) +
502 xfs_calc_inode_res(mp, 1) +
503 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
504
505}
506
507/*
508 * Growing the data section of the filesystem.
509 * superblock
510 * agi and agf
511 * allocation btrees
512 */
513STATIC uint
514xfs_calc_growdata_reservation(
515 struct xfs_mount *mp)
516{
517 return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
518 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
519 XFS_FSB_TO_B(mp, 1));
520}
521
522/*
523 * Growing the rt section of the filesystem.
524 * In the first set of transactions (ALLOC) we allocate space to the
525 * bitmap or summary files.
526 * superblock: sector size
527 * agf of the ag from which the extent is allocated: sector size
528 * bmap btree for bitmap/summary inode: max depth * blocksize
529 * bitmap/summary inode: inode size
530 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
531 */
532STATIC uint
533xfs_calc_growrtalloc_reservation(
534 struct xfs_mount *mp)
535{
536 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
537 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
538 XFS_FSB_TO_B(mp, 1)) +
539 xfs_calc_inode_res(mp, 1) +
540 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
541 XFS_FSB_TO_B(mp, 1));
542}
543
544/*
545 * Growing the rt section of the filesystem.
546 * In the second set of transactions (ZERO) we zero the new metadata blocks.
547 * one bitmap/summary block: blocksize
548 */
549STATIC uint
550xfs_calc_growrtzero_reservation(
551 struct xfs_mount *mp)
552{
553 return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
554}
555
556/*
557 * Growing the rt section of the filesystem.
558 * In the third set of transactions (FREE) we update metadata without
559 * allocating any new blocks.
560 * superblock: sector size
561 * bitmap inode: inode size
562 * summary inode: inode size
563 * one bitmap block: blocksize
564 * summary blocks: new summary size
565 */
566STATIC uint
567xfs_calc_growrtfree_reservation(
568 struct xfs_mount *mp)
569{
570 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
571 xfs_calc_inode_res(mp, 2) +
572 xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
573 xfs_calc_buf_res(1, mp->m_rsumsize);
574}
575
576/*
577 * Logging the inode modification timestamp on a synchronous write.
578 * inode
579 */
580STATIC uint
581xfs_calc_swrite_reservation(
582 struct xfs_mount *mp)
583{
584 return xfs_calc_inode_res(mp, 1);
585}
586
587/*
588 * Logging the inode mode bits when writing a setuid/setgid file
589 * inode
590 */
591STATIC uint
592xfs_calc_writeid_reservation(
593 struct xfs_mount *mp)
594{
595 return xfs_calc_inode_res(mp, 1);
596}
597
598/*
599 * Converting the inode from non-attributed to attributed.
600 * the inode being converted: inode size
601 * agf block and superblock (for block allocation)
602 * the new block (directory sized)
603 * bmap blocks for the new directory block
604 * allocation btrees
605 */
606STATIC uint
607xfs_calc_addafork_reservation(
608 struct xfs_mount *mp)
609{
610 return XFS_DQUOT_LOGRES(mp) +
611 xfs_calc_inode_res(mp, 1) +
612 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
613 xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
614 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
615 XFS_FSB_TO_B(mp, 1)) +
616 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
617 XFS_FSB_TO_B(mp, 1));
618}
619
620/*
621 * Removing the attribute fork of a file
622 * the inode being truncated: inode size
623 * the inode's bmap btree: max depth * block size
624 * And the bmap_finish transaction can free the blocks and bmap blocks:
625 * the agf for each of the ags: 4 * sector size
626 * the agfl for each of the ags: 4 * sector size
627 * the super block to reflect the freed blocks: sector size
628 * worst case split in allocation btrees per extent assuming 4 extents:
629 * 4 exts * 2 trees * (2 * max depth - 1) * block size
630 */
631STATIC uint
632xfs_calc_attrinval_reservation(
633 struct xfs_mount *mp)
634{
635 return MAX((xfs_calc_inode_res(mp, 1) +
636 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
637 XFS_FSB_TO_B(mp, 1))),
638 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
639 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
640 XFS_FSB_TO_B(mp, 1))));
641}
642
643/*
644 * Setting an attribute at mount time.
645 * the inode getting the attribute
646 * the superblock for allocations
647 * the agfs extents are allocated from
648 * the attribute btree * max depth
649 * the inode allocation btree
650 * Since attribute transaction space is dependent on the size of the attribute,
651 * the calculation is done partially at mount time and partially at runtime(see
652 * below).
653 */
654STATIC uint
655xfs_calc_attrsetm_reservation(
656 struct xfs_mount *mp)
657{
658 return XFS_DQUOT_LOGRES(mp) +
659 xfs_calc_inode_res(mp, 1) +
660 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
661 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
662}
663
664/*
665 * Setting an attribute at runtime, transaction space unit per block.
666 * the superblock for allocations: sector size
667 * the inode bmap btree could join or split: max depth * block size
668 * Since the runtime attribute transaction space is dependent on the total
669 * blocks needed for the 1st bmap, here we calculate out the space unit for
670 * one block so that the caller could figure out the total space according
671 * to the attibute extent length in blocks by:
672 * ext * M_RES(mp)->tr_attrsetrt.tr_logres
673 */
674STATIC uint
675xfs_calc_attrsetrt_reservation(
676 struct xfs_mount *mp)
677{
678 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
679 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
680 XFS_FSB_TO_B(mp, 1));
681}
682
683/*
684 * Removing an attribute.
685 * the inode: inode size
686 * the attribute btree could join: max depth * block size
687 * the inode bmap btree could join or split: max depth * block size
688 * And the bmap_finish transaction can free the attr blocks freed giving:
689 * the agf for the ag in which the blocks live: 2 * sector size
690 * the agfl for the ag in which the blocks live: 2 * sector size
691 * the superblock for the free block count: sector size
692 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
693 */
694STATIC uint
695xfs_calc_attrrm_reservation(
696 struct xfs_mount *mp)
697{
698 return XFS_DQUOT_LOGRES(mp) +
699 MAX((xfs_calc_inode_res(mp, 1) +
700 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
701 XFS_FSB_TO_B(mp, 1)) +
702 (uint)XFS_FSB_TO_B(mp,
703 XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
704 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
705 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
706 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
707 XFS_FSB_TO_B(mp, 1))));
708}
709
710/*
711 * Clearing a bad agino number in an agi hash bucket.
712 */
713STATIC uint
714xfs_calc_clear_agi_bucket_reservation(
715 struct xfs_mount *mp)
716{
717 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
718}
719
720/*
721 * Clearing the quotaflags in the superblock.
722 * the super block for changing quota flags: sector size
723 */
724STATIC uint
725xfs_calc_qm_sbchange_reservation(
726 struct xfs_mount *mp)
727{
728 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
729}
730
731/*
732 * Adjusting quota limits.
733 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
734 */
735STATIC uint
736xfs_calc_qm_setqlim_reservation(
737 struct xfs_mount *mp)
738{
739 return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
740}
741
742/*
743 * Allocating quota on disk if needed.
744 * the write transaction log space for quota file extent allocation
745 * the unit of quota allocation: one system block size
746 */
747STATIC uint
748xfs_calc_qm_dqalloc_reservation(
749 struct xfs_mount *mp)
750{
751 return xfs_calc_write_reservation(mp) +
752 xfs_calc_buf_res(1,
753 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
754}
755
756/*
757 * Turning off quotas.
758 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
759 * the superblock for the quota flags: sector size
760 */
761STATIC uint
762xfs_calc_qm_quotaoff_reservation(
763 struct xfs_mount *mp)
764{
765 return sizeof(struct xfs_qoff_logitem) * 2 +
766 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
767}
768
769/*
770 * End of turning off quotas.
771 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
772 */
773STATIC uint
774xfs_calc_qm_quotaoff_end_reservation(
775 struct xfs_mount *mp)
776{
777 return sizeof(struct xfs_qoff_logitem) * 2;
778}
779
780/*
781 * Syncing the incore super block changes to disk.
782 * the super block to reflect the changes: sector size
783 */
784STATIC uint
785xfs_calc_sb_reservation(
786 struct xfs_mount *mp)
787{
788 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
789}
790
791void
792xfs_trans_resv_calc(
793 struct xfs_mount *mp,
794 struct xfs_trans_resv *resp)
795{
796 /*
797 * The following transactions are logged in physical format and
798 * require a permanent reservation on space.
799 */
800 resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
801 resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
802 resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
803
804 resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
805 resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
806 resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
807
808 resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
809 resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
810 resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
811
812 resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
813 resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
814 resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
815
816 resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
817 resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
818 resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
819
820 resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
821 resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
822 resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
823
824 resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
825 resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
826 resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
827
828 resp->tr_create_tmpfile.tr_logres =
829 xfs_calc_create_tmpfile_reservation(mp);
830 resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
831 resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
832
833 resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
834 resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
835 resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
836
837 resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
838 resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
839 resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
840
841 resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
842 resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
843 resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
844
845 resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
846 resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
847 resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
848
849 resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
850 resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
851 resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
852
853 resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
854 resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
855 resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
856
857 resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
858 resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
859 resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
860
861 resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
862 resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
863 resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
864
865 /*
866 * The following transactions are logged in logical format with
867 * a default log count.
868 */
869 resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
870 resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
871
872 resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
873 resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
874
875 resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
876 resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
877
878 resp->tr_qm_equotaoff.tr_logres =
879 xfs_calc_qm_quotaoff_end_reservation(mp);
880 resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
881
882 resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
883 resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
884
885 /* The following transaction are logged in logical format */
886 resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
887 resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
888 resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
889 resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
890 resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
891 resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
892 resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
893 resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
894}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
new file mode 100644
index 000000000000..1097d14cd583
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -0,0 +1,117 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_TRANS_RESV_H__
19#define __XFS_TRANS_RESV_H__
20
21struct xfs_mount;
22
23/*
24 * structure for maintaining pre-calculated transaction reservations.
25 */
26struct xfs_trans_res {
27 uint tr_logres; /* log space unit in bytes per log ticket */
28 int tr_logcount; /* number of log operations per log ticket */
29 int tr_logflags; /* log flags, currently only used for indicating
30 * a reservation request is permanent or not */
31};
32
33struct xfs_trans_resv {
34 struct xfs_trans_res tr_write; /* extent alloc trans */
35 struct xfs_trans_res tr_itruncate; /* truncate trans */
36 struct xfs_trans_res tr_rename; /* rename trans */
37 struct xfs_trans_res tr_link; /* link trans */
38 struct xfs_trans_res tr_remove; /* unlink trans */
39 struct xfs_trans_res tr_symlink; /* symlink trans */
40 struct xfs_trans_res tr_create; /* create trans */
41 struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */
42 struct xfs_trans_res tr_mkdir; /* mkdir trans */
43 struct xfs_trans_res tr_ifree; /* inode free trans */
44 struct xfs_trans_res tr_ichange; /* inode update trans */
45 struct xfs_trans_res tr_growdata; /* fs data section grow trans */
46 struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
47 struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
48 struct xfs_trans_res tr_attrinval; /* attr fork buffer
49 * invalidation */
50 struct xfs_trans_res tr_attrsetm; /* set/create an attribute at
51 * mount time */
52 struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at
53 * runtime */
54 struct xfs_trans_res tr_attrrm; /* remove an attribute */
55 struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */
56 struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
57 struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
58 struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
59 struct xfs_trans_res tr_qm_sbchange; /* change quota flags */
60 struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
61 struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
62 struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
63 struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */
64 struct xfs_trans_res tr_sb; /* modify superblock */
65 struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
66};
67
/*
 * Shorthand for the address of the mount's reservation table (m_resv).
 */
#define	M_RES(mp) \
	(&(mp)->m_resv)
70
/*
 * Per-extent log reservation for the allocation btree changes
 * involved in freeing or allocating an extent:
 *	2 trees * (2 blocks/level * max depth - 1) * block size
 *
 * XFS_ALLOCFREE_LOG_RES gives the figure in bytes for @nx extents;
 * XFS_ALLOCFREE_LOG_COUNT gives the matching number of blocks.
 */
#define	XFS_ALLOCFREE_LOG_RES(mp,nx)	\
	((nx) * \
	 (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))

#define	XFS_ALLOCFREE_LOG_COUNT(mp,nx)	\
	((nx) * \
	 (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
80
/*
 * Per-directory log reservation for any directory change:
 *	dir blocks: (1 btree block per level + data block + free block)
 *		    * dblock size
 *	bmap btree: (levels + 2) * max depth * block size
 * v2 directory blocks can be fragmented below the dirblksize down to the
 * fsb size, so that is accounted for in the DAENTER macros.
 *
 * XFS_DIROP_LOG_RES is in bytes; XFS_DIROP_LOG_COUNT is the matching number
 * of blocks.
 */
#define	XFS_DIROP_LOG_RES(mp)	\
	(XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
	 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))

#define	XFS_DIROP_LOG_COUNT(mp)	\
	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
94
/*
 * Log count values, stored in tr_logcount by xfs_trans_resv_calc().
 * Grouped here by value.
 */

/* A single log operation. */
#define	XFS_DEFAULT_LOG_COUNT		1
#define	XFS_ATTRINVAL_LOG_COUNT		1

/* Two log operations. */
#define	XFS_DEFAULT_PERM_LOG_COUNT	2
#define	XFS_ITRUNCATE_LOG_COUNT		2
#define	XFS_INACTIVE_LOG_COUNT		2
#define	XFS_CREATE_LOG_COUNT		2
#define	XFS_CREATE_TMPFILE_LOG_COUNT	2
#define	XFS_REMOVE_LOG_COUNT		2
#define	XFS_LINK_LOG_COUNT		2
#define	XFS_RENAME_LOG_COUNT		2
#define	XFS_WRITE_LOG_COUNT		2
#define	XFS_ADDAFORK_LOG_COUNT		2

/* Three log operations. */
#define	XFS_MKDIR_LOG_COUNT		3
#define	XFS_SYMLINK_LOG_COUNT		3
#define	XFS_ATTRSET_LOG_COUNT		3
#define	XFS_ATTRRM_LOG_COUNT		3
114
/* Compute the log reservations for @mp and store them in @resp. */
void
xfs_trans_resv_calc(
	struct xfs_mount	*mp,
	struct xfs_trans_resv	*resp);
116
117#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
new file mode 100644
index 000000000000..bf9c4579334d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -0,0 +1,92 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_TRANS_SPACE_H__
19#define __XFS_TRANS_SPACE_H__
20
/*
 * Components of space reservations.
 *
 * XFS_MAX_CONTIG_EXTENTS_PER_BLOCK: difference between the m_alloc_mxr[0]
 * and m_alloc_mnr[0] record counts of the mount.
 * XFS_EXTENTADD_SPACE_RES: one block fewer than the maximum bmap btree
 * level count of fork @w.
 */
#define	XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)	\
	(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))

#define	XFS_EXTENTADD_SPACE_RES(mp,w)	\
	(XFS_BM_MAXLEVELS(mp,w) - 1)
/*
 * Space needed to add @b extents to fork @w: @b is rounded up to a whole
 * number of XFS_MAX_CONTIG_EXTENTS_PER_BLOCK() groups, and each group costs
 * XFS_EXTENTADD_SPACE_RES() blocks.
 *
 * @b is parenthesised so that an expression argument (e.g. "c ? x : y" or
 * "n << 1") is added as a unit instead of being regrouped by precedence.
 */
#define	XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
	((((b) + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
	  XFS_EXTENTADD_SPACE_RES(mp,w))
/*
 * Building blocks for directory/attribute entry reservations, for fork @w.
 *
 * XFS_DAENTER_1B:	filesystem blocks per da block -- the directory
 *			geometry's fsbcount for the data fork, otherwise 1.
 * XFS_DAENTER_DBS:	XFS_DA_NODE_MAXDEPTH, plus 2 for the data fork.
 * XFS_DAENTER_BLOCKS:	XFS_DAENTER_1B * XFS_DAENTER_DBS.
 * XFS_DAENTER_BMAP1B:	extent-add space for XFS_DAENTER_1B blocks.
 * XFS_DAENTER_BMAPS:	XFS_DAENTER_DBS * XFS_DAENTER_BMAP1B.
 */
#define	XFS_DAENTER_1B(mp,w)	\
	((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)

#define	XFS_DAENTER_DBS(mp,w)	\
	(XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))

#define	XFS_DAENTER_BLOCKS(mp,w)	\
	(XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))

#define	XFS_DAENTER_BMAP1B(mp,w)	\
	XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)

#define	XFS_DAENTER_BMAPS(mp,w)		\
	(XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))

/* Entering an entry: the da blocks plus their bmap blocks. */
#define	XFS_DAENTER_SPACE_RES(mp,w)	\
	(XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))

/* Removing an entry: only the bmap blocks. */
#define	XFS_DAREMOVE_SPACE_RES(mp,w)	XFS_DAENTER_BMAPS(mp,w)

/* Directory variants, always on the data fork; the split factor is 1. */
#define	XFS_DIRENTER_MAX_SPLIT(mp,nl)	1

#define	XFS_DIRENTER_SPACE_RES(mp,nl)	\
	(XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
	 XFS_DIRENTER_MAX_SPLIT(mp,nl))

#define	XFS_DIRREMOVE_SPACE_RES(mp)	\
	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
/*
 * Blocks reserved for an inode allocation: m_ialloc_blks plus
 * (m_in_maxlevels - 1) blocks per inode btree that may be modified -- two
 * btrees when xfs_sb_version_hasfinobt() is true, one otherwise.
 *
 * The tree-count selection must be parenthesised: "?:" binds more loosely
 * than "*", so without the parentheses the finobt case evaluated to a flat
 * 2 blocks instead of 2 * (m_in_maxlevels - 1).  @mp is parenthesised as
 * well so that expression arguments expand safely.
 */
#define	XFS_IALLOC_SPACE_RES(mp)	\
	((mp)->m_ialloc_blks + \
	 ((xfs_sb_version_hasfinobt(&(mp)->m_sb) ? 2 : 1) * \
	  ((mp)->m_in_maxlevels - 1)))
54
/*
 * Space reservation values for various transactions, built from the
 * component macros above.
 */

/* Attribute fork operations. */
#define	XFS_ADDAFORK_SPACE_RES(mp)	\
	((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))

#define	XFS_ATTRRM_SPACE_RES(mp)	\
	XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)

/* This macro is not used - see inline code in xfs_attr_set */
#define	XFS_ATTRSET_SPACE_RES(mp, v)	\
	(XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))

/* Inode allocation plus a directory entry. */
#define	XFS_CREATE_SPACE_RES(mp,nl)	\
	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))

/* @v blocks plus the bmap btree growth for the data fork. */
#define	XFS_DIOSTRAT_SPACE_RES(mp, v)	\
	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))

/* Growing the filesystem. */
#define	XFS_GROWFS_SPACE_RES(mp)	\
	(2 * XFS_AG_MAXLEVELS(mp))

#define	XFS_GROWFSRT_SPACE_RES(mp,b)	\
	((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))

/* Directory entry only. */
#define	XFS_LINK_SPACE_RES(mp,nl)	\
	XFS_DIRENTER_SPACE_RES(mp,nl)

/* Same sum as XFS_CREATE_SPACE_RES. */
#define	XFS_MKDIR_SPACE_RES(mp,nl)	\
	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))

/* Quota operations. */
#define	XFS_QM_DQALLOC_SPACE_RES(mp)	\
	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
	 XFS_DQUOT_CLUSTER_SIZE_FSB)

#define	XFS_QM_QINOCREATE_SPACE_RES(mp)	\
	XFS_IALLOC_SPACE_RES(mp)

/* Directory entry removal, and removal plus entry for rename. */
#define	XFS_REMOVE_SPACE_RES(mp)	\
	XFS_DIRREMOVE_SPACE_RES(mp)

#define	XFS_RENAME_SPACE_RES(mp,nl)	\
	(XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))

/* Inode allocation, directory entry and @b extra blocks. */
#define	XFS_SYMLINK_SPACE_RES(mp,nl,b)	\
	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
/*
 * Blocks reserved when freeing an inode: m_in_maxlevels when
 * xfs_sb_version_hasfinobt() is true, otherwise none.  @mp is parenthesised
 * everywhere so that an expression argument such as "&mount" expands
 * correctly.
 */
#define	XFS_IFREE_SPACE_RES(mp)		\
	(xfs_sb_version_hasfinobt(&(mp)->m_sb) ? (mp)->m_in_maxlevels : 0)
90
91
92#endif /* __XFS_TRANS_SPACE_H__ */