path: root/fs/xfs/xfs_iget.c
author     David Chinner <dgc@sgi.com>                2007-08-28 00:00:13 -0400
committer  Tim Shimmin <tes@chook.melbourne.sgi.com>  2007-10-15 02:50:50 -0400
commit     da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb (patch)
tree       84454023d649df67cc6b125c73746ddb341ac34e /fs/xfs/xfs_iget.c
parent     39cd9f877e63ce7e02cdc7f5dbf1b908451c9532 (diff)
[XFS] Radix tree based inode caching
One of the perpetual scaling problems XFS has is indexing its incore inodes. We currently use hashes, and the default hash sizes chosen can only ever be a tradeoff between memory consumption and the maximum realistic size of the cache.

As a result, anyone who has millions of inodes cached on a filesystem needs to tune the size of the cache via the ihashsize mount option to get decent scalability from inode cache operations.

A further problem is the separate inode cluster hash, whose size is based on the ihashsize but is smaller, so under certain conditions (sparse cluster cache population) it can become a limitation long before the inode hash is causing issues.

The following patchset removes the inode hash and cluster hash and replaces them with radix trees to avoid the scalability limitations of the hashes. It also reduces the size of the inodes by 3 pointers....

SGI-PV: 969561
SGI-Modid: xfs-linux-melb:xfs-kern:29481a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
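To make the new scheme concrete, here is a minimal, hypothetical sketch of the lookup side it describes: one radix tree per allocation group, indexed by the AG-relative inode number and guarded by a per-AG rwlock. This is illustrative only, not code from the patch; the my_perag, my_inode, my_perag_init and my_lookup_ino names, the MY_INEW flag and the msleep() back-off are stand-ins for the XFS-specific equivalents (xfs_perag_t, xfs_inode_t, XFS_INEW, delay()).

/* Illustrative sketch only -- simplified stand-in for the per-AG inode cache. */
#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>

struct my_perag {
	rwlock_t		pag_ici_lock;	/* protects pag_ici_root */
	struct radix_tree_root	pag_ici_root;	/* incore inodes, indexed by agino */
};

struct my_inode {
	unsigned long	agino;		/* AG-relative inode number */
	unsigned long	flags;
};
#define MY_INEW		0x1		/* inode is still being initialised */

static void my_perag_init(struct my_perag *pag)
{
	rwlock_init(&pag->pag_ici_lock);
	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
}

/* Return the cached inode for agino, or NULL; wait out inodes still in set-up. */
static struct my_inode *my_lookup_ino(struct my_perag *pag, unsigned long agino)
{
	struct my_inode *ip;

again:
	read_lock(&pag->pag_ici_lock);
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (ip && (ip->flags & MY_INEW)) {
		/* another thread is initialising this inode: back off and retry */
		read_unlock(&pag->pag_ici_lock);
		msleep(1);
		goto again;
	}
	read_unlock(&pag->pag_ici_lock);
	return ip;
}

Splitting the index per allocation group keeps each tree (and its lock) small and naturally partitions contention, and indexing by agino keeps the key within an unsigned long regardless of filesystem size.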
Diffstat (limited to 'fs/xfs/xfs_iget.c')
-rw-r--r--  fs/xfs/xfs_iget.c | 585
1 file changed, 195 insertions(+), 390 deletions(-)
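The insertion side of the same idea, again as a hypothetical sketch reusing the my_perag/my_inode types from the sketch above rather than the patch's real xfs_perag/xfs_icluster code: preload the radix-tree node allocator before taking the per-AG write lock, then treat -EEXIST from radix_tree_insert() as "somebody else inserted this inode first, go back to the lookup". The patch below additionally retries on preload failure and attaches the inode to a per-cluster hlist at the same point.

/* Illustrative sketch only -- insertion on a cache miss, types as above. */
static int my_insert_ino(struct my_perag *pag, struct my_inode *ip)
{
	int error;

	/* allocate radix-tree nodes up front; we cannot sleep under the write lock */
	error = radix_tree_preload(GFP_KERNEL);
	if (error)
		return error;		/* the real code delays and retries instead */

	write_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, ip->agino, ip);
	if (!error)
		ip->flags |= MY_INEW;	/* keep lookups away until set-up completes */
	write_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	return error;			/* -EEXIST: lost the race, caller re-does the lookup */
}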
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 114433a22baa..e07dcc1b70a6 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -40,131 +40,13 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41 41
42/* 42/*
43 * Initialize the inode hash table for the newly mounted file system.
44 * Choose an initial table size based on user specified value, else
45 * use a simple algorithm using the maximum number of inodes as an
46 * indicator for table size, and clamp it between one and some large
47 * number of pages.
48 */
49void
50xfs_ihash_init(xfs_mount_t *mp)
51{
52 __uint64_t icount;
53 uint i;
54
55 if (!mp->m_ihsize) {
56 icount = mp->m_maxicount ? mp->m_maxicount :
57 (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog);
58 mp->m_ihsize = 1 << max_t(uint, 8,
59 (xfs_highbit64(icount) + 1) / 2);
60 mp->m_ihsize = min_t(uint, mp->m_ihsize,
61 (64 * NBPP) / sizeof(xfs_ihash_t));
62 }
63
64 mp->m_ihash = kmem_zalloc_greedy(&mp->m_ihsize,
65 NBPC * sizeof(xfs_ihash_t),
66 mp->m_ihsize * sizeof(xfs_ihash_t),
67 KM_SLEEP | KM_MAYFAIL | KM_LARGE);
68 mp->m_ihsize /= sizeof(xfs_ihash_t);
69 for (i = 0; i < mp->m_ihsize; i++)
70 rwlock_init(&(mp->m_ihash[i].ih_lock));
71}
72
73/*
74 * Free up structures allocated by xfs_ihash_init, at unmount time.
75 */
76void
77xfs_ihash_free(xfs_mount_t *mp)
78{
79 kmem_free(mp->m_ihash, mp->m_ihsize * sizeof(xfs_ihash_t));
80 mp->m_ihash = NULL;
81}
82
83/*
84 * Initialize the inode cluster hash table for the newly mounted file system.
85 * Its size is derived from the ihash table size.
86 */
87void
88xfs_chash_init(xfs_mount_t *mp)
89{
90 uint i;
91
92 mp->m_chsize = max_t(uint, 1, mp->m_ihsize /
93 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog));
94 mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
95 mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
96 * sizeof(xfs_chash_t),
97 KM_SLEEP | KM_LARGE);
98 for (i = 0; i < mp->m_chsize; i++) {
99 spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
100 }
101}
102
103/*
104 * Free up structures allocated by xfs_chash_init, at unmount time.
105 */
106void
107xfs_chash_free(xfs_mount_t *mp)
108{
109 int i;
110
111 for (i = 0; i < mp->m_chsize; i++) {
112 spinlock_destroy(&mp->m_chash[i].ch_lock);
113 }
114
115 kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
116 mp->m_chash = NULL;
117}
118
119/*
120 * Try to move an inode to the front of its hash list if possible
121 * (and if its not there already). Called right after obtaining
122 * the list version number and then dropping the read_lock on the
123 * hash list in question (which is done right after looking up the
124 * inode in question...).
125 */
126STATIC void
127xfs_ihash_promote(
128 xfs_ihash_t *ih,
129 xfs_inode_t *ip,
130 ulong version)
131{
132 xfs_inode_t *iq;
133
134 if ((ip->i_prevp != &ih->ih_next) && write_trylock(&ih->ih_lock)) {
135 if (likely(version == ih->ih_version)) {
136 /* remove from list */
137 if ((iq = ip->i_next)) {
138 iq->i_prevp = ip->i_prevp;
139 }
140 *ip->i_prevp = iq;
141
142 /* insert at list head */
143 iq = ih->ih_next;
144 iq->i_prevp = &ip->i_next;
145 ip->i_next = iq;
146 ip->i_prevp = &ih->ih_next;
147 ih->ih_next = ip;
148 }
149 write_unlock(&ih->ih_lock);
150 }
151}
152
153/*
154 * Look up an inode by number in the given file system. 43 * Look up an inode by number in the given file system.
155 * The inode is looked up in the hash table for the file system 44 * The inode is looked up in the cache held in each AG.
156 * represented by the mount point parameter mp. Each bucket of 45 * If the inode is found in the cache, attach it to the provided
157 * the hash table is guarded by an individual semaphore. 46 * vnode.
158 *
159 * If the inode is found in the hash table, its corresponding vnode
160 * is obtained with a call to vn_get(). This call takes care of
161 * coordination with the reclamation of the inode and vnode. Note
162 * that the vmap structure is filled in while holding the hash lock.
163 * This gives us the state of the inode/vnode when we found it and
164 * is used for coordination in vn_get().
165 * 47 *
166 * If it is not in core, read it in from the file system's device and 48 * If it is not in core, read it in from the file system's device,
167 * add the inode into the hash table. 49 * add it to the cache and attach the provided vnode.
168 * 50 *
169 * The inode is locked according to the value of the lock_flags parameter. 51 * The inode is locked according to the value of the lock_flags parameter.
170 * This flag parameter indicates how and if the inode's IO lock and inode lock 52 * This flag parameter indicates how and if the inode's IO lock and inode lock
@@ -192,274 +74,241 @@ xfs_iget_core(
192 xfs_inode_t **ipp, 74 xfs_inode_t **ipp,
193 xfs_daddr_t bno) 75 xfs_daddr_t bno)
194{ 76{
195 xfs_ihash_t *ih;
196 xfs_inode_t *ip; 77 xfs_inode_t *ip;
197 xfs_inode_t *iq; 78 xfs_inode_t *iq;
198 bhv_vnode_t *inode_vp; 79 bhv_vnode_t *inode_vp;
199 ulong version;
200 int error; 80 int error;
201 /* REFERENCED */ 81 xfs_icluster_t *icl, *new_icl = NULL;
202 xfs_chash_t *ch; 82 unsigned long first_index, mask;
203 xfs_chashlist_t *chl, *chlnew; 83 xfs_perag_t *pag;
204 SPLDECL(s); 84 xfs_agino_t agino;
85
86 /* the radix tree exists only in inode capable AGs */
87 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
88 return EINVAL;
89
90 /* get the perag structure and ensure that it's inode capable */
91 pag = xfs_get_perag(mp, ino);
92 if (!pag->pagi_inodeok)
93 return EINVAL;
94 ASSERT(pag->pag_ici_init);
95 agino = XFS_INO_TO_AGINO(mp, ino);
205 96
97again:
98 read_lock(&pag->pag_ici_lock);
99 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
206 100
207 ih = XFS_IHASH(mp, ino); 101 if (ip != NULL) {
102 /*
103 * If INEW is set this inode is being set up
104 * we need to pause and try again.
105 */
106 if (xfs_iflags_test(ip, XFS_INEW)) {
107 read_unlock(&pag->pag_ici_lock);
108 delay(1);
109 XFS_STATS_INC(xs_ig_frecycle);
208 110
209again: 111 goto again;
210 read_lock(&ih->ih_lock); 112 }
211 113
212 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { 114 inode_vp = XFS_ITOV_NULL(ip);
213 if (ip->i_ino == ino) { 115 if (inode_vp == NULL) {
214 /* 116 /*
215 * If INEW is set this inode is being set up 117 * If IRECLAIM is set this inode is
118 * on its way out of the system,
216 * we need to pause and try again. 119 * we need to pause and try again.
217 */ 120 */
218 if (xfs_iflags_test(ip, XFS_INEW)) { 121 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
219 read_unlock(&ih->ih_lock); 122 read_unlock(&pag->pag_ici_lock);
220 delay(1); 123 delay(1);
221 XFS_STATS_INC(xs_ig_frecycle); 124 XFS_STATS_INC(xs_ig_frecycle);
222 125
223 goto again; 126 goto again;
224 } 127 }
128 ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
225 129
226 inode_vp = XFS_ITOV_NULL(ip); 130 /*
227 if (inode_vp == NULL) { 131 * If lookup is racing with unlink, then we
228 /* 132 * should return an error immediately so we
229 * If IRECLAIM is set this inode is 133 * don't remove it from the reclaim list and
230 * on its way out of the system, 134 * potentially leak the inode.
231 * we need to pause and try again. 135 */
232 */ 136 if ((ip->i_d.di_mode == 0) &&
233 if (xfs_iflags_test(ip, XFS_IRECLAIM)) { 137 !(flags & XFS_IGET_CREATE)) {
234 read_unlock(&ih->ih_lock); 138 read_unlock(&pag->pag_ici_lock);
235 delay(1); 139 xfs_put_perag(mp, pag);
236 XFS_STATS_INC(xs_ig_frecycle); 140 return ENOENT;
237 141 }
238 goto again;
239 }
240 ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
241
242 /*
243 * If lookup is racing with unlink, then we
244 * should return an error immediately so we
245 * don't remove it from the reclaim list and
246 * potentially leak the inode.
247 */
248 if ((ip->i_d.di_mode == 0) &&
249 !(flags & XFS_IGET_CREATE)) {
250 read_unlock(&ih->ih_lock);
251 return ENOENT;
252 }
253
254 /*
255 * There may be transactions sitting in the
256 * incore log buffers or being flushed to disk
257 * at this time. We can't clear the
258 * XFS_IRECLAIMABLE flag until these
259 * transactions have hit the disk, otherwise we
260 * will void the guarantee the flag provides
261 * xfs_iunpin()
262 */
263 if (xfs_ipincount(ip)) {
264 read_unlock(&ih->ih_lock);
265 xfs_log_force(mp, 0,
266 XFS_LOG_FORCE|XFS_LOG_SYNC);
267 XFS_STATS_INC(xs_ig_frecycle);
268 goto again;
269 }
270
271 vn_trace_exit(vp, "xfs_iget.alloc",
272 (inst_t *)__return_address);
273 142
274 XFS_STATS_INC(xs_ig_found); 143 /*
144 * There may be transactions sitting in the
145 * incore log buffers or being flushed to disk
146 * at this time. We can't clear the
147 * XFS_IRECLAIMABLE flag until these
148 * transactions have hit the disk, otherwise we
149 * will void the guarantee the flag provides
150 * xfs_iunpin()
151 */
152 if (xfs_ipincount(ip)) {
153 read_unlock(&pag->pag_ici_lock);
154 xfs_log_force(mp, 0,
155 XFS_LOG_FORCE|XFS_LOG_SYNC);
156 XFS_STATS_INC(xs_ig_frecycle);
157 goto again;
158 }
275 159
276 xfs_iflags_clear(ip, XFS_IRECLAIMABLE); 160 vn_trace_exit(vp, "xfs_iget.alloc",
277 version = ih->ih_version; 161 (inst_t *)__return_address);
278 read_unlock(&ih->ih_lock);
279 xfs_ihash_promote(ih, ip, version);
280 162
281 XFS_MOUNT_ILOCK(mp); 163 XFS_STATS_INC(xs_ig_found);
282 list_del_init(&ip->i_reclaim);
283 XFS_MOUNT_IUNLOCK(mp);
284 164
285 goto finish_inode; 165 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
166 read_unlock(&pag->pag_ici_lock);
286 167
287 } else if (vp != inode_vp) { 168 XFS_MOUNT_ILOCK(mp);
288 struct inode *inode = vn_to_inode(inode_vp); 169 list_del_init(&ip->i_reclaim);
170 XFS_MOUNT_IUNLOCK(mp);
289 171
290 /* The inode is being torn down, pause and 172 goto finish_inode;
291 * try again.
292 */
293 if (inode->i_state & (I_FREEING | I_CLEAR)) {
294 read_unlock(&ih->ih_lock);
295 delay(1);
296 XFS_STATS_INC(xs_ig_frecycle);
297 173
298 goto again; 174 } else if (vp != inode_vp) {
299 } 175 struct inode *inode = vn_to_inode(inode_vp);
300/* Chances are the other vnode (the one in the inode) is being torn
301 * down right now, and we landed on top of it. Question is, what do
302 * we do? Unhook the old inode and hook up the new one?
303 */
304 cmn_err(CE_PANIC,
305 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
306 inode_vp, vp);
307 }
308 176
309 /* 177 /* The inode is being torn down, pause and
310 * Inode cache hit: if ip is not at the front of 178 * try again.
311 * its hash chain, move it there now.
312 * Do this with the lock held for update, but
313 * do statistics after releasing the lock.
314 */ 179 */
315 version = ih->ih_version; 180 if (inode->i_state & (I_FREEING | I_CLEAR)) {
316 read_unlock(&ih->ih_lock); 181 read_unlock(&pag->pag_ici_lock);
317 xfs_ihash_promote(ih, ip, version); 182 delay(1);
318 XFS_STATS_INC(xs_ig_found); 183 XFS_STATS_INC(xs_ig_frecycle);
319 184
320finish_inode: 185 goto again;
321 if (ip->i_d.di_mode == 0) {
322 if (!(flags & XFS_IGET_CREATE))
323 return ENOENT;
324 xfs_iocore_inode_reinit(ip);
325 } 186 }
187/* Chances are the other vnode (the one in the inode) is being torn
188* down right now, and we landed on top of it. Question is, what do
189* we do? Unhook the old inode and hook up the new one?
190*/
191 cmn_err(CE_PANIC,
192 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
193 inode_vp, vp);
194 }
326 195
327 if (lock_flags != 0) 196 /*
328 xfs_ilock(ip, lock_flags); 197 * Inode cache hit
198 */
199 read_unlock(&pag->pag_ici_lock);
200 XFS_STATS_INC(xs_ig_found);
329 201
330 xfs_iflags_clear(ip, XFS_ISTALE); 202finish_inode:
331 vn_trace_exit(vp, "xfs_iget.found", 203 if (ip->i_d.di_mode == 0) {
332 (inst_t *)__return_address); 204 if (!(flags & XFS_IGET_CREATE)) {
333 goto return_ip; 205 xfs_put_perag(mp, pag);
206 return ENOENT;
207 }
208 xfs_iocore_inode_reinit(ip);
334 } 209 }
210
211 if (lock_flags != 0)
212 xfs_ilock(ip, lock_flags);
213
214 xfs_iflags_clear(ip, XFS_ISTALE);
215 vn_trace_exit(vp, "xfs_iget.found",
216 (inst_t *)__return_address);
217 goto return_ip;
335 } 218 }
336 219
337 /* 220 /*
338 * Inode cache miss: save the hash chain version stamp and unlock 221 * Inode cache miss
339 * the chain, so we don't deadlock in vn_alloc.
340 */ 222 */
223 read_unlock(&pag->pag_ici_lock);
341 XFS_STATS_INC(xs_ig_missed); 224 XFS_STATS_INC(xs_ig_missed);
342 225
343 version = ih->ih_version;
344
345 read_unlock(&ih->ih_lock);
346
347 /* 226 /*
348 * Read the disk inode attributes into a new inode structure and get 227 * Read the disk inode attributes into a new inode structure and get
349 * a new vnode for it. This should also initialize i_ino and i_mount. 228 * a new vnode for it. This should also initialize i_ino and i_mount.
350 */ 229 */
351 error = xfs_iread(mp, tp, ino, &ip, bno, 230 error = xfs_iread(mp, tp, ino, &ip, bno,
352 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0); 231 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
353 if (error) 232 if (error) {
233 xfs_put_perag(mp, pag);
354 return error; 234 return error;
235 }
355 236
356 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); 237 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
357 238
358 xfs_inode_lock_init(ip, vp); 239 xfs_inode_lock_init(ip, vp);
359 xfs_iocore_inode_init(ip); 240 xfs_iocore_inode_init(ip);
360
361 if (lock_flags) 241 if (lock_flags)
362 xfs_ilock(ip, lock_flags); 242 xfs_ilock(ip, lock_flags);
363 243
364 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 244 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
365 xfs_idestroy(ip); 245 xfs_idestroy(ip);
246 xfs_put_perag(mp, pag);
366 return ENOENT; 247 return ENOENT;
367 } 248 }
368 249
369 /* 250 /*
370 * Put ip on its hash chain, unless someone else hashed a duplicate 251 * This is a bit messy - we preallocate everything we _might_
371 * after we released the hash lock. 252 * need before we pick up the ici lock. That way we don't have to
253 * juggle locks and go all the way back to the start.
372 */ 254 */
373 write_lock(&ih->ih_lock); 255 new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
256 if (radix_tree_preload(GFP_KERNEL)) {
257 delay(1);
258 goto again;
259 }
260 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
261 first_index = agino & mask;
262 write_lock(&pag->pag_ici_lock);
374 263
375 if (ih->ih_version != version) { 264 /*
376 for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) { 265 * Find the cluster if it exists
377 if (iq->i_ino == ino) { 266 */
378 write_unlock(&ih->ih_lock); 267 icl = NULL;
379 xfs_idestroy(ip); 268 if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
269 first_index, 1)) {
270 if ((iq->i_ino & mask) == first_index)
271 icl = iq->i_cluster;
272 }
380 273
381 XFS_STATS_INC(xs_ig_dup); 274 /*
382 goto again; 275 * insert the new inode
383 } 276 */
384 } 277 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
278 if (unlikely(error)) {
279 BUG_ON(error != -EEXIST);
280 write_unlock(&pag->pag_ici_lock);
281 radix_tree_preload_end();
282 xfs_idestroy(ip);
283 XFS_STATS_INC(xs_ig_dup);
284 goto again;
385 } 285 }
386 286
387 /* 287 /*
388 * These values _must_ be set before releasing ihlock! 288 * These values _must_ be set before releasing ihlock!
389 */ 289 */
390 ip->i_hash = ih;
391 if ((iq = ih->ih_next)) {
392 iq->i_prevp = &ip->i_next;
393 }
394 ip->i_next = iq;
395 ip->i_prevp = &ih->ih_next;
396 ih->ih_next = ip;
397 ip->i_udquot = ip->i_gdquot = NULL; 290 ip->i_udquot = ip->i_gdquot = NULL;
398 ih->ih_version++;
399 xfs_iflags_set(ip, XFS_INEW); 291 xfs_iflags_set(ip, XFS_INEW);
400 write_unlock(&ih->ih_lock);
401 292
402 /* 293 ASSERT(ip->i_cluster == NULL);
403 * put ip on its cluster's hash chain
404 */
405 ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
406 ip->i_cnext == NULL);
407
408 chlnew = NULL;
409 ch = XFS_CHASH(mp, ip->i_blkno);
410 chlredo:
411 s = mutex_spinlock(&ch->ch_lock);
412 for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
413 if (chl->chl_blkno == ip->i_blkno) {
414
415 /* insert this inode into the doubly-linked list
416 * where chl points */
417 if ((iq = chl->chl_ip)) {
418 ip->i_cprev = iq->i_cprev;
419 iq->i_cprev->i_cnext = ip;
420 iq->i_cprev = ip;
421 ip->i_cnext = iq;
422 } else {
423 ip->i_cnext = ip;
424 ip->i_cprev = ip;
425 }
426 chl->chl_ip = ip;
427 ip->i_chash = chl;
428 break;
429 }
430 }
431 294
432 /* no hash list found for this block; add a new hash list */ 295 if (!icl) {
433 if (chl == NULL) { 296 spin_lock_init(&new_icl->icl_lock);
434 if (chlnew == NULL) { 297 INIT_HLIST_HEAD(&new_icl->icl_inodes);
435 mutex_spinunlock(&ch->ch_lock, s); 298 icl = new_icl;
436 ASSERT(xfs_chashlist_zone != NULL); 299 new_icl = NULL;
437 chlnew = (xfs_chashlist_t *)
438 kmem_zone_alloc(xfs_chashlist_zone,
439 KM_SLEEP);
440 ASSERT(chlnew != NULL);
441 goto chlredo;
442 } else {
443 ip->i_cnext = ip;
444 ip->i_cprev = ip;
445 ip->i_chash = chlnew;
446 chlnew->chl_ip = ip;
447 chlnew->chl_blkno = ip->i_blkno;
448 if (ch->ch_list)
449 ch->ch_list->chl_prev = chlnew;
450 chlnew->chl_next = ch->ch_list;
451 chlnew->chl_prev = NULL;
452 ch->ch_list = chlnew;
453 chlnew = NULL;
454 }
455 } else { 300 } else {
456 if (chlnew != NULL) { 301 ASSERT(!hlist_empty(&icl->icl_inodes));
457 kmem_zone_free(xfs_chashlist_zone, chlnew);
458 }
459 } 302 }
303 spin_lock(&icl->icl_lock);
304 hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
305 ip->i_cluster = icl;
306 spin_unlock(&icl->icl_lock);
460 307
461 mutex_spinunlock(&ch->ch_lock, s); 308 write_unlock(&pag->pag_ici_lock);
462 309 radix_tree_preload_end();
310 if (new_icl)
311 kmem_zone_free(xfs_icluster_zone, new_icl);
463 312
464 /* 313 /*
465 * Link ip to its mount and thread it on the mount's inode list. 314 * Link ip to its mount and thread it on the mount's inode list.
@@ -478,6 +327,7 @@ finish_inode:
478 mp->m_inodes = ip; 327 mp->m_inodes = ip;
479 328
480 XFS_MOUNT_IUNLOCK(mp); 329 XFS_MOUNT_IUNLOCK(mp);
330 xfs_put_perag(mp, pag);
481 331
482 return_ip: 332 return_ip:
483 ASSERT(ip->i_df.if_ext_max == 333 ASSERT(ip->i_df.if_ext_max ==
@@ -587,32 +437,19 @@ xfs_inode_incore(xfs_mount_t *mp,
587 xfs_ino_t ino, 437 xfs_ino_t ino,
588 xfs_trans_t *tp) 438 xfs_trans_t *tp)
589{ 439{
590 xfs_ihash_t *ih;
591 xfs_inode_t *ip; 440 xfs_inode_t *ip;
592 ulong version; 441 xfs_perag_t *pag;
593 442
594 ih = XFS_IHASH(mp, ino); 443 pag = xfs_get_perag(mp, ino);
595 read_lock(&ih->ih_lock); 444 read_lock(&pag->pag_ici_lock);
596 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { 445 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
597 if (ip->i_ino == ino) { 446 read_unlock(&pag->pag_ici_lock);
598 /* 447 xfs_put_perag(mp, pag);
599 * If we find it and tp matches, return it. 448
600 * Also move it to the front of the hash list 449 /* the returned inode must match the transaction */
601 * if we find it and it is not already there. 450 if (ip && (ip->i_transp != tp))
602 * Otherwise break from the loop and return 451 return NULL;
603 * NULL. 452 return ip;
604 */
605 if (ip->i_transp == tp) {
606 version = ih->ih_version;
607 read_unlock(&ih->ih_lock);
608 xfs_ihash_promote(ih, ip, version);
609 return (ip);
610 }
611 break;
612 }
613 }
614 read_unlock(&ih->ih_lock);
615 return (NULL);
616} 453}
617 454
618/* 455/*
@@ -718,58 +555,26 @@ void
718xfs_iextract( 555xfs_iextract(
719 xfs_inode_t *ip) 556 xfs_inode_t *ip)
720{ 557{
721 xfs_ihash_t *ih; 558 xfs_mount_t *mp = ip->i_mount;
559 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
722 xfs_inode_t *iq; 560 xfs_inode_t *iq;
723 xfs_mount_t *mp; 561
724 xfs_chash_t *ch; 562 write_lock(&pag->pag_ici_lock);
725 xfs_chashlist_t *chl, *chm; 563 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
726 SPLDECL(s); 564 write_unlock(&pag->pag_ici_lock);
727 565 xfs_put_perag(mp, pag);
728 ih = ip->i_hash;
729 write_lock(&ih->ih_lock);
730 if ((iq = ip->i_next)) {
731 iq->i_prevp = ip->i_prevp;
732 }
733 *ip->i_prevp = iq;
734 ih->ih_version++;
735 write_unlock(&ih->ih_lock);
736 566
737 /* 567 /*
738 * Remove from cluster hash list 568 * Remove from cluster list
739 * 1) delete the chashlist if this is the last inode on the chashlist
740 * 2) unchain from list of inodes
741 * 3) point chashlist->chl_ip to 'chl_next' if to this inode.
742 */ 569 */
743 mp = ip->i_mount; 570 mp = ip->i_mount;
744 ch = XFS_CHASH(mp, ip->i_blkno); 571 spin_lock(&ip->i_cluster->icl_lock);
745 s = mutex_spinlock(&ch->ch_lock); 572 hlist_del(&ip->i_cnode);
746 573 spin_unlock(&ip->i_cluster->icl_lock);
747 if (ip->i_cnext == ip) { 574
748 /* Last inode on chashlist */ 575 /* was last inode in cluster? */
749 ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); 576 if (hlist_empty(&ip->i_cluster->icl_inodes))
750 ASSERT(ip->i_chash != NULL); 577 kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
751 chm=NULL;
752 chl = ip->i_chash;
753 if (chl->chl_prev)
754 chl->chl_prev->chl_next = chl->chl_next;
755 else
756 ch->ch_list = chl->chl_next;
757 if (chl->chl_next)
758 chl->chl_next->chl_prev = chl->chl_prev;
759 kmem_zone_free(xfs_chashlist_zone, chl);
760 } else {
761 /* delete one inode from a non-empty list */
762 iq = ip->i_cnext;
763 iq->i_cprev = ip->i_cprev;
764 ip->i_cprev->i_cnext = iq;
765 if (ip->i_chash->chl_ip == ip) {
766 ip->i_chash->chl_ip = iq;
767 }
768 ip->i_chash = __return_address;
769 ip->i_cprev = __return_address;
770 ip->i_cnext = __return_address;
771 }
772 mutex_spinunlock(&ch->ch_lock, s);
773 578
774 /* 579 /*
775 * Remove from mount's inode list. 580 * Remove from mount's inode list.