Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/xfs_fs_subr.c   |   96
-rw-r--r--  fs/xfs/xfs_iget.c      |  720
-rw-r--r--  fs/xfs/xfs_qm_stats.c  |  105
-rw-r--r--  fs/xfs/xfs_qm_stats.h  |   53
-rw-r--r--  fs/xfs/xfs_rw.c        |  175
-rw-r--r--  fs/xfs/xfs_rw.h        |   49
-rw-r--r--  fs/xfs/xfs_sync.c      | 1065
-rw-r--r--  fs/xfs/xfs_sync.h      |   51
8 files changed, 2314 insertions, 0 deletions
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
new file mode 100644
index 00000000000..ed88ed16811
--- /dev/null
+++ b/fs/xfs/xfs_fs_subr.c
@@ -0,0 +1,96 @@
1/*
2 * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h"
22#include "xfs_trace.h"
23
24/*
25 * note: all filemap functions return negative error codes. These
26 * need to be inverted before returning to the xfs core functions.
27 */
28void
29xfs_tosspages(
30 xfs_inode_t *ip,
31 xfs_off_t first,
32 xfs_off_t last,
33 int fiopt)
34{
35 /* can't toss partial tail pages, so mask them out */
36 last &= ~(PAGE_SIZE - 1);
37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38}
39
40int
41xfs_flushinval_pages(
42 xfs_inode_t *ip,
43 xfs_off_t first,
44 xfs_off_t last,
45 int fiopt)
46{
47 struct address_space *mapping = VFS_I(ip)->i_mapping;
48 int ret = 0;
49
50 trace_xfs_pagecache_inval(ip, first, last);
51
52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
53 ret = filemap_write_and_wait_range(mapping, first,
54 last == -1 ? LLONG_MAX : last);
55 if (!ret)
56 truncate_inode_pages_range(mapping, first, last);
57 return -ret;
58}
59
60int
61xfs_flush_pages(
62 xfs_inode_t *ip,
63 xfs_off_t first,
64 xfs_off_t last,
65 uint64_t flags,
66 int fiopt)
67{
68 struct address_space *mapping = VFS_I(ip)->i_mapping;
69 int ret = 0;
70 int ret2;
71
72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
73 ret = -filemap_fdatawrite_range(mapping, first,
74 last == -1 ? LLONG_MAX : last);
75 if (flags & XBF_ASYNC)
76 return ret;
77 ret2 = xfs_wait_on_pages(ip, first, last);
78 if (!ret)
79 ret = ret2;
80 return ret;
81}
82
83int
84xfs_wait_on_pages(
85 xfs_inode_t *ip,
86 xfs_off_t first,
87 xfs_off_t last)
88{
89 struct address_space *mapping = VFS_I(ip)->i_mapping;
90
91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last);
94 }
95 return 0;
96}
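
The helpers above are thin wrappers over the generic filemap calls; note the sign inversion described in the comment at the top of the file. As a hedged sketch only (not part of this patch; the function name is invented), a direct I/O path would typically use xfs_flushinval_pages() like this to keep the page cache coherent with the I/O:

/*
 * Illustrative sketch, assuming the usual XFS headers.
 * xfs_example_dio_prepare() is a hypothetical name, not a real caller.
 */
STATIC int
xfs_example_dio_prepare(
	xfs_inode_t	*ip,
	xfs_off_t	offset)
{
	/* write back and toss cached pages from 'offset' to EOF */
	return xfs_flushinval_pages(ip, (offset & PAGE_CACHE_MASK),
				    -1, FI_REMAPF_LOCKED);
}
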
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
new file mode 100644
index 00000000000..7759812c1bb
--- /dev/null
+++ b/fs/xfs/xfs_iget.c
@@ -0,0 +1,720 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_acl.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_dinode.h"
33#include "xfs_inode.h"
34#include "xfs_btree.h"
35#include "xfs_ialloc.h"
36#include "xfs_quota.h"
37#include "xfs_utils.h"
38#include "xfs_trans_priv.h"
39#include "xfs_inode_item.h"
40#include "xfs_bmap.h"
41#include "xfs_trace.h"
42
43
44/*
45 * Define xfs inode iolock lockdep classes. We need to ensure that all active
46 * inodes are considered the same for lockdep purposes, including inodes that
47 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
48 * guarantee the locks are considered the same when there are multiple lock
49 * initialisation sites. Also, define a reclaimable inode class so it is
50 * obvious in lockdep reports which class the report is against.
51 */
52static struct lock_class_key xfs_iolock_active;
53struct lock_class_key xfs_iolock_reclaimable;
54
55/*
56 * Allocate and initialise an xfs_inode.
57 */
58STATIC struct xfs_inode *
59xfs_inode_alloc(
60 struct xfs_mount *mp,
61 xfs_ino_t ino)
62{
63 struct xfs_inode *ip;
64
65 /*
66 * if this didn't occur in transactions, we could use
67 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
68 * code up to do this anyway.
69 */
70 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
71 if (!ip)
72 return NULL;
73 if (inode_init_always(mp->m_super, VFS_I(ip))) {
74 kmem_zone_free(xfs_inode_zone, ip);
75 return NULL;
76 }
77
78 ASSERT(atomic_read(&ip->i_iocount) == 0);
79 ASSERT(atomic_read(&ip->i_pincount) == 0);
80 ASSERT(!spin_is_locked(&ip->i_flags_lock));
81 ASSERT(completion_done(&ip->i_flush));
82 ASSERT(ip->i_ino == 0);
83
84 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
85 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
86 &xfs_iolock_active, "xfs_iolock_active");
87
88 /* initialise the xfs inode */
89 ip->i_ino = ino;
90 ip->i_mount = mp;
91 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
92 ip->i_afp = NULL;
93 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
94 ip->i_flags = 0;
95 ip->i_update_core = 0;
96 ip->i_delayed_blks = 0;
97 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
98 ip->i_size = 0;
99 ip->i_new_size = 0;
100
101 return ip;
102}
103
104STATIC void
105xfs_inode_free_callback(
106 struct rcu_head *head)
107{
108 struct inode *inode = container_of(head, struct inode, i_rcu);
109 struct xfs_inode *ip = XFS_I(inode);
110
111 INIT_LIST_HEAD(&inode->i_dentry);
112 kmem_zone_free(xfs_inode_zone, ip);
113}
114
115void
116xfs_inode_free(
117 struct xfs_inode *ip)
118{
119 switch (ip->i_d.di_mode & S_IFMT) {
120 case S_IFREG:
121 case S_IFDIR:
122 case S_IFLNK:
123 xfs_idestroy_fork(ip, XFS_DATA_FORK);
124 break;
125 }
126
127 if (ip->i_afp)
128 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
129
130 if (ip->i_itemp) {
131 /*
132 * Only if we are shutting down the fs will we see an
133 * inode still in the AIL. If it is there, we should remove
134 * it to prevent a use-after-free from occurring.
135 */
136 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
137 struct xfs_ail *ailp = lip->li_ailp;
138
139 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
140 XFS_FORCED_SHUTDOWN(ip->i_mount));
141 if (lip->li_flags & XFS_LI_IN_AIL) {
142 spin_lock(&ailp->xa_lock);
143 if (lip->li_flags & XFS_LI_IN_AIL)
144 xfs_trans_ail_delete(ailp, lip);
145 else
146 spin_unlock(&ailp->xa_lock);
147 }
148 xfs_inode_item_destroy(ip);
149 ip->i_itemp = NULL;
150 }
151
152 /* asserts to verify all state is correct here */
153 ASSERT(atomic_read(&ip->i_iocount) == 0);
154 ASSERT(atomic_read(&ip->i_pincount) == 0);
155 ASSERT(!spin_is_locked(&ip->i_flags_lock));
156 ASSERT(completion_done(&ip->i_flush));
157
158 /*
159 * Because we use RCU freeing we need to ensure the inode always
160 * appears to be reclaimed with an invalid inode number when in the
161 * free state. The ip->i_flags_lock provides the barrier against lookup
162 * races.
163 */
164 spin_lock(&ip->i_flags_lock);
165 ip->i_flags = XFS_IRECLAIM;
166 ip->i_ino = 0;
167 spin_unlock(&ip->i_flags_lock);
168
169 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
170}
171
172/*
173 * Check the validity of the inode we just found in the cache
174 */
175static int
176xfs_iget_cache_hit(
177 struct xfs_perag *pag,
178 struct xfs_inode *ip,
179 xfs_ino_t ino,
180 int flags,
181 int lock_flags) __releases(RCU)
182{
183 struct inode *inode = VFS_I(ip);
184 struct xfs_mount *mp = ip->i_mount;
185 int error;
186
187 /*
188 * check for re-use of an inode within an RCU grace period due to the
189 * radix tree nodes not being updated yet. We monitor for this by
190 * setting the inode number to zero before freeing the inode structure.
191 * If the inode has been reallocated and set up, then the inode number
192 * will not match, so check for that, too.
193 */
194 spin_lock(&ip->i_flags_lock);
195 if (ip->i_ino != ino) {
196 trace_xfs_iget_skip(ip);
197 XFS_STATS_INC(xs_ig_frecycle);
198 error = EAGAIN;
199 goto out_error;
200 }
201
202
203 /*
204 * If we are racing with another cache hit that is currently
205 * instantiating this inode or currently recycling it out of
206 * reclaimable state, wait for the initialisation to complete
207 * before continuing.
208 *
209 * XXX(hch): eventually we should do something equivalent to
210 * wait_on_inode to wait for these flags to be cleared
211 * instead of polling for it.
212 */
213 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
214 trace_xfs_iget_skip(ip);
215 XFS_STATS_INC(xs_ig_frecycle);
216 error = EAGAIN;
217 goto out_error;
218 }
219
220 /*
221 * If lookup is racing with unlink return an error immediately.
222 */
223 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
224 error = ENOENT;
225 goto out_error;
226 }
227
228 /*
229 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
230 * Need to carefully get it back into useable state.
231 */
232 if (ip->i_flags & XFS_IRECLAIMABLE) {
233 trace_xfs_iget_reclaim(ip);
234
235 /*
236 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
237 * from stomping over us while we recycle the inode. We can't
238 * clear the radix tree reclaimable tag yet as it requires
239 * pag_ici_lock to be held exclusive.
240 */
241 ip->i_flags |= XFS_IRECLAIM;
242
243 spin_unlock(&ip->i_flags_lock);
244 rcu_read_unlock();
245
246 error = -inode_init_always(mp->m_super, inode);
247 if (error) {
248 /*
249 * Re-initializing the inode failed, and we are in deep
250 * trouble. Try to re-add it to the reclaim list.
251 */
252 rcu_read_lock();
253 spin_lock(&ip->i_flags_lock);
254
255 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
256 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
257 trace_xfs_iget_reclaim_fail(ip);
258 goto out_error;
259 }
260
261 spin_lock(&pag->pag_ici_lock);
262 spin_lock(&ip->i_flags_lock);
263
264 /*
265 * Clear the per-lifetime state in the inode as we are now
266 * effectively a new inode and need to return to the initial
267 * state before reuse occurs.
268 */
269 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
270 ip->i_flags |= XFS_INEW;
271 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
272 inode->i_state = I_NEW;
273
274 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
275 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
276 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
277 &xfs_iolock_active, "xfs_iolock_active");
278
279 spin_unlock(&ip->i_flags_lock);
280 spin_unlock(&pag->pag_ici_lock);
281 } else {
282 /* If the VFS inode is being torn down, pause and try again. */
283 if (!igrab(inode)) {
284 trace_xfs_iget_skip(ip);
285 error = EAGAIN;
286 goto out_error;
287 }
288
289 /* We've got a live one. */
290 spin_unlock(&ip->i_flags_lock);
291 rcu_read_unlock();
292 trace_xfs_iget_hit(ip);
293 }
294
295 if (lock_flags != 0)
296 xfs_ilock(ip, lock_flags);
297
298 xfs_iflags_clear(ip, XFS_ISTALE);
299 XFS_STATS_INC(xs_ig_found);
300
301 return 0;
302
303out_error:
304 spin_unlock(&ip->i_flags_lock);
305 rcu_read_unlock();
306 return error;
307}
308
309
310static int
311xfs_iget_cache_miss(
312 struct xfs_mount *mp,
313 struct xfs_perag *pag,
314 xfs_trans_t *tp,
315 xfs_ino_t ino,
316 struct xfs_inode **ipp,
317 int flags,
318 int lock_flags)
319{
320 struct xfs_inode *ip;
321 int error;
322 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
323
324 ip = xfs_inode_alloc(mp, ino);
325 if (!ip)
326 return ENOMEM;
327
328 error = xfs_iread(mp, tp, ip, flags);
329 if (error)
330 goto out_destroy;
331
332 trace_xfs_iget_miss(ip);
333
334 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
335 error = ENOENT;
336 goto out_destroy;
337 }
338
339 /*
340 * Preload the radix tree so we can insert safely under the
341 * write spinlock. Note that we cannot sleep inside the preload
342 * region.
343 */
344 if (radix_tree_preload(GFP_KERNEL)) {
345 error = EAGAIN;
346 goto out_destroy;
347 }
348
349 /*
350 * Because the inode hasn't been added to the radix-tree yet it can't
351 * be found by another thread, so we can do the non-sleeping lock here.
352 */
353 if (lock_flags) {
354 if (!xfs_ilock_nowait(ip, lock_flags))
355 BUG();
356 }
357
358 spin_lock(&pag->pag_ici_lock);
359
360 /* insert the new inode */
361 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
362 if (unlikely(error)) {
363 WARN_ON(error != -EEXIST);
364 XFS_STATS_INC(xs_ig_dup);
365 error = EAGAIN;
366 goto out_preload_end;
367 }
368
369 /* These values _must_ be set before releasing the radix tree lock! */
370 ip->i_udquot = ip->i_gdquot = NULL;
371 xfs_iflags_set(ip, XFS_INEW);
372
373 spin_unlock(&pag->pag_ici_lock);
374 radix_tree_preload_end();
375
376 *ipp = ip;
377 return 0;
378
379out_preload_end:
380 spin_unlock(&pag->pag_ici_lock);
381 radix_tree_preload_end();
382 if (lock_flags)
383 xfs_iunlock(ip, lock_flags);
384out_destroy:
385 __destroy_inode(VFS_I(ip));
386 xfs_inode_free(ip);
387 return error;
388}
389
390/*
391 * Look up an inode by number in the given file system.
392 * The inode is looked up in the cache held in each AG.
393 * If the inode is found in the cache, initialise the vfs inode
394 * if necessary.
395 *
396 * If it is not in core, read it in from the file system's device,
397 * add it to the cache and initialise the vfs inode.
398 *
399 * The inode is locked according to the value of the lock_flags parameter.
400 * This flag parameter indicates how and if the inode's IO lock and inode lock
401 * should be taken.
402 *
403 * mp -- the mount point structure for the current file system. It points
404 * to the inode hash table.
405 * tp -- a pointer to the current transaction if there is one. This is
406 * simply passed through to the xfs_iread() call.
407 * ino -- the number of the inode desired. This is the unique identifier
408 * within the file system for the inode being requested.
409 * lock_flags -- flags indicating how to lock the inode. See the comment
410 * for xfs_ilock() for a list of valid values.
411 */
412int
413xfs_iget(
414 xfs_mount_t *mp,
415 xfs_trans_t *tp,
416 xfs_ino_t ino,
417 uint flags,
418 uint lock_flags,
419 xfs_inode_t **ipp)
420{
421 xfs_inode_t *ip;
422 int error;
423 xfs_perag_t *pag;
424 xfs_agino_t agino;
425
426 /* reject inode numbers outside existing AGs */
427 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
428 return EINVAL;
429
430 /* get the perag structure and ensure that it's inode capable */
431 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
432 agino = XFS_INO_TO_AGINO(mp, ino);
433
434again:
435 error = 0;
436 rcu_read_lock();
437 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
438
439 if (ip) {
440 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
441 if (error)
442 goto out_error_or_again;
443 } else {
444 rcu_read_unlock();
445 XFS_STATS_INC(xs_ig_missed);
446
447 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
448 flags, lock_flags);
449 if (error)
450 goto out_error_or_again;
451 }
452 xfs_perag_put(pag);
453
454 *ipp = ip;
455
456 ASSERT(ip->i_df.if_ext_max ==
457 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
458 /*
459 * If we have a real type for an on-disk inode, we can set ops(&unlock)
460 * now. If it's a new inode being created, xfs_ialloc will handle it.
461 */
462 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
463 xfs_setup_inode(ip);
464 return 0;
465
466out_error_or_again:
467 if (error == EAGAIN) {
468 delay(1);
469 goto again;
470 }
471 xfs_perag_put(pag);
472 return error;
473}
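
As a hedged illustration of the calling convention (positive error codes, reference dropped with IRELE()), a minimal lookup might look like the invented function below; real callers such as the directory lookup path follow the same shape:

/*
 * Illustrative sketch only; xfs_example_stat() is a hypothetical name.
 */
STATIC int
xfs_example_stat(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;
	/* ... read ip->i_d fields under the shared ilock ... */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	IRELE(ip);
	return 0;
}
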
474
475/*
476 * This is a wrapper routine around the xfs_ilock() routine
477 * used to centralize some grungy code. It is used in places
478 * that wish to lock the inode solely for reading the extents.
479 * The reason these places can't just call xfs_ilock(SHARED)
480 * is that the inode lock also guards the bringing in of the
481 * extents from disk for a file in b-tree format. If the inode
482 * is in b-tree format, then we need to lock the inode exclusively
483 * until the extents are read in. Locking it exclusively all
484 * the time would limit our parallelism unnecessarily, though.
485 * What we do instead is check to see if the extents have been
486 * read in yet, and only lock the inode exclusively if they
487 * have not.
488 *
489 * The function returns a value which should be given to the
490 * corresponding xfs_iunlock_map_shared(). This value is
491 * the mode in which the lock was actually taken.
492 */
493uint
494xfs_ilock_map_shared(
495 xfs_inode_t *ip)
496{
497 uint lock_mode;
498
499 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
500 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
501 lock_mode = XFS_ILOCK_EXCL;
502 } else {
503 lock_mode = XFS_ILOCK_SHARED;
504 }
505
506 xfs_ilock(ip, lock_mode);
507
508 return lock_mode;
509}
510
511/*
512 * This is simply the unlock routine to go with xfs_ilock_map_shared().
513 * All it does is call xfs_iunlock() with the given lock_mode.
514 */
515void
516xfs_iunlock_map_shared(
517 xfs_inode_t *ip,
518 unsigned int lock_mode)
519{
520 xfs_iunlock(ip, lock_mode);
521}
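
A hedged sketch of the intended pairing (the function name is invented): the mode returned by xfs_ilock_map_shared() must be handed back unchanged, since the caller cannot know whether the shared or exclusive lock was taken:

STATIC void
xfs_example_walk_extents(
	xfs_inode_t	*ip)
{
	uint		lock_mode;

	lock_mode = xfs_ilock_map_shared(ip);
	/* extents are now in-core; it is safe to walk ip->i_df here */
	xfs_iunlock_map_shared(ip, lock_mode);
}
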
522
523/*
524 * The xfs inode contains 2 locks: a multi-reader lock called the
525 * i_iolock and a multi-reader lock called the i_lock. This routine
526 * allows either or both of the locks to be obtained.
527 *
528 * The 2 locks should always be ordered so that the IO lock is
529 * obtained first in order to prevent deadlock.
530 *
531 * ip -- the inode being locked
532 * lock_flags -- this parameter indicates the inode's locks
533 * to be locked. It can be:
534 * XFS_IOLOCK_SHARED,
535 * XFS_IOLOCK_EXCL,
536 * XFS_ILOCK_SHARED,
537 * XFS_ILOCK_EXCL,
538 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
539 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
540 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
541 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
542 */
543void
544xfs_ilock(
545 xfs_inode_t *ip,
546 uint lock_flags)
547{
548 /*
549 * You can't set both SHARED and EXCL for the same lock,
550 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
551 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
552 */
553 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
554 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
555 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
556 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
557 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
558
559 if (lock_flags & XFS_IOLOCK_EXCL)
560 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
561 else if (lock_flags & XFS_IOLOCK_SHARED)
562 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
563
564 if (lock_flags & XFS_ILOCK_EXCL)
565 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
566 else if (lock_flags & XFS_ILOCK_SHARED)
567 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
568
569 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
570}
571
572/*
573 * This is just like xfs_ilock(), except that the caller
574 * is guaranteed not to sleep. It returns 1 if it gets
575 * the requested locks and 0 otherwise. If the IO lock is
576 * obtained but the inode lock cannot be, then the IO lock
577 * is dropped before returning.
578 *
579 * ip -- the inode being locked
580 * lock_flags -- this parameter indicates the inode's locks to be
581 * locked. See the comment for xfs_ilock() for a list
582 * of valid values.
583 */
584int
585xfs_ilock_nowait(
586 xfs_inode_t *ip,
587 uint lock_flags)
588{
589 /*
590 * You can't set both SHARED and EXCL for the same lock,
591 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
592 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
593 */
594 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
595 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
596 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
597 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
598 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
599
600 if (lock_flags & XFS_IOLOCK_EXCL) {
601 if (!mrtryupdate(&ip->i_iolock))
602 goto out;
603 } else if (lock_flags & XFS_IOLOCK_SHARED) {
604 if (!mrtryaccess(&ip->i_iolock))
605 goto out;
606 }
607 if (lock_flags & XFS_ILOCK_EXCL) {
608 if (!mrtryupdate(&ip->i_lock))
609 goto out_undo_iolock;
610 } else if (lock_flags & XFS_ILOCK_SHARED) {
611 if (!mrtryaccess(&ip->i_lock))
612 goto out_undo_iolock;
613 }
614 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
615 return 1;
616
617 out_undo_iolock:
618 if (lock_flags & XFS_IOLOCK_EXCL)
619 mrunlock_excl(&ip->i_iolock);
620 else if (lock_flags & XFS_IOLOCK_SHARED)
621 mrunlock_shared(&ip->i_iolock);
622 out:
623 return 0;
624}
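
A hedged sketch of the ordering rule from the comments above (iolock before ilock) and of the trylock fallback; the function is invented. Note that xfs_ilock_nowait() already drops the iolock itself when only the ilock attempt fails:

STATIC int
xfs_example_lock_both(
	xfs_inode_t	*ip,
	int		can_sleep)
{
	uint		flags = XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL;

	if (can_sleep) {
		xfs_ilock(ip, flags);	/* takes iolock, then ilock */
		return 1;
	}
	return xfs_ilock_nowait(ip, flags);	/* 1 on success, 0 on failure */
}
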
625
626/*
627 * xfs_iunlock() is used to drop the inode locks acquired with
628 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
629 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
630 * that we know which locks to drop.
631 *
632 * ip -- the inode being unlocked
633 * lock_flags -- this parameter indicates the inode's locks to be
634 * unlocked. See the comment for xfs_ilock() for a list
635 * of valid values for this parameter.
636 *
637 */
638void
639xfs_iunlock(
640 xfs_inode_t *ip,
641 uint lock_flags)
642{
643 /*
644 * You can't set both SHARED and EXCL for the same lock,
645 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
646 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
647 */
648 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
649 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
650 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
651 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
652 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
653 XFS_LOCK_DEP_MASK)) == 0);
654 ASSERT(lock_flags != 0);
655
656 if (lock_flags & XFS_IOLOCK_EXCL)
657 mrunlock_excl(&ip->i_iolock);
658 else if (lock_flags & XFS_IOLOCK_SHARED)
659 mrunlock_shared(&ip->i_iolock);
660
661 if (lock_flags & XFS_ILOCK_EXCL)
662 mrunlock_excl(&ip->i_lock);
663 else if (lock_flags & XFS_ILOCK_SHARED)
664 mrunlock_shared(&ip->i_lock);
665
666 if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) &&
667 !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) {
668 /*
669 * Let the AIL know that this item has been unlocked in case
670 * it is in the AIL and anyone is waiting on it. Don't do
671 * this if the caller has asked us not to.
672 */
673 xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
674 (xfs_log_item_t*)(ip->i_itemp));
675 }
676 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
677}
678
679/*
680 * Give up write locks. The I/O lock cannot be held nested
681 * if it is being demoted.
682 */
683void
684xfs_ilock_demote(
685 xfs_inode_t *ip,
686 uint lock_flags)
687{
688 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
689 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
690
691 if (lock_flags & XFS_ILOCK_EXCL)
692 mrdemote(&ip->i_lock);
693 if (lock_flags & XFS_IOLOCK_EXCL)
694 mrdemote(&ip->i_iolock);
695
696 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
697}
698
699#ifdef DEBUG
700int
701xfs_isilocked(
702 xfs_inode_t *ip,
703 uint lock_flags)
704{
705 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
706 if (!(lock_flags & XFS_ILOCK_SHARED))
707 return !!ip->i_lock.mr_writer;
708 return rwsem_is_locked(&ip->i_lock.mr_lock);
709 }
710
711 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
712 if (!(lock_flags & XFS_IOLOCK_SHARED))
713 return !!ip->i_iolock.mr_writer;
714 return rwsem_is_locked(&ip->i_iolock.mr_lock);
715 }
716
717 ASSERT(0);
718 return 0;
719}
720#endif
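
xfs_isilocked() exists purely to back assertions; a hedged sketch of a typical call site (the function is invented):

#ifdef DEBUG
STATIC void
xfs_example_assert_ilocked(
	xfs_inode_t	*ip)
{
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	/* ... code that requires the exclusive ilock ... */
}
#endif
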
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
new file mode 100644
index 00000000000..8671a0b3264
--- /dev/null
+++ b/fs/xfs/xfs_qm_stats.c
@@ -0,0 +1,105 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_alloc.h"
27#include "xfs_quota.h"
28#include "xfs_mount.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_inode.h"
31#include "xfs_itable.h"
32#include "xfs_bmap.h"
33#include "xfs_rtalloc.h"
34#include "xfs_error.h"
35#include "xfs_attr.h"
36#include "xfs_buf_item.h"
37#include "xfs_qm.h"
38
39struct xqmstats xqmstats;
40
41static int xqm_proc_show(struct seq_file *m, void *v)
42{
43 /* maximum; incore; ratio free to inuse; freelist */
44 seq_printf(m, "%d\t%d\t%d\t%u\n",
45 ndquot,
46 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
47 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
48 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
49 return 0;
50}
51
52static int xqm_proc_open(struct inode *inode, struct file *file)
53{
54 return single_open(file, xqm_proc_show, NULL);
55}
56
57static const struct file_operations xqm_proc_fops = {
58 .owner = THIS_MODULE,
59 .open = xqm_proc_open,
60 .read = seq_read,
61 .llseek = seq_lseek,
62 .release = single_release,
63};
64
65static int xqmstat_proc_show(struct seq_file *m, void *v)
66{
67 /* quota performance statistics */
68 seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
69 xqmstats.xs_qm_dqreclaims,
70 xqmstats.xs_qm_dqreclaim_misses,
71 xqmstats.xs_qm_dquot_dups,
72 xqmstats.xs_qm_dqcachemisses,
73 xqmstats.xs_qm_dqcachehits,
74 xqmstats.xs_qm_dqwants,
75 xqmstats.xs_qm_dqshake_reclaims,
76 xqmstats.xs_qm_dqinact_reclaims);
77 return 0;
78}
79
80static int xqmstat_proc_open(struct inode *inode, struct file *file)
81{
82 return single_open(file, xqmstat_proc_show, NULL);
83}
84
85static const struct file_operations xqmstat_proc_fops = {
86 .owner = THIS_MODULE,
87 .open = xqmstat_proc_open,
88 .read = seq_read,
89 .llseek = seq_lseek,
90 .release = single_release,
91};
92
93void
94xfs_qm_init_procfs(void)
95{
96 proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
97 proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
98}
99
100void
101xfs_qm_cleanup_procfs(void)
102{
103 remove_proc_entry("fs/xfs/xqm", NULL);
104 remove_proc_entry("fs/xfs/xqmstat", NULL);
105}
diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
new file mode 100644
index 00000000000..5b964fc0dc0
--- /dev/null
+++ b/fs/xfs/xfs_qm_stats.h
@@ -0,0 +1,53 @@
1/*
2 * Copyright (c) 2002 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_QM_STATS_H__
19#define __XFS_QM_STATS_H__
20
21#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
22
23/*
24 * XQM global statistics
25 */
26struct xqmstats {
27 __uint32_t xs_qm_dqreclaims;
28 __uint32_t xs_qm_dqreclaim_misses;
29 __uint32_t xs_qm_dquot_dups;
30 __uint32_t xs_qm_dqcachemisses;
31 __uint32_t xs_qm_dqcachehits;
32 __uint32_t xs_qm_dqwants;
33 __uint32_t xs_qm_dqshake_reclaims;
34 __uint32_t xs_qm_dqinact_reclaims;
35};
36
37extern struct xqmstats xqmstats;
38
39# define XQM_STATS_INC(count) ( (count)++ )
40
41extern void xfs_qm_init_procfs(void);
42extern void xfs_qm_cleanup_procfs(void);
43
44#else
45
46# define XQM_STATS_INC(count) do { } while (0)
47
48static inline void xfs_qm_init_procfs(void) { };
49static inline void xfs_qm_cleanup_procfs(void) { };
50
51#endif
52
53#endif /* __XFS_QM_STATS_H__ */
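
A hedged sketch of a call site for the macro above (the function is invented; the real call sites live in the quota code). Because XQM_STATS_INC() compiles to a no-op when CONFIG_PROC_FS is off or XFS_STATS_OFF is set, callers need no #ifdefs of their own:

static void
xfs_example_count_dqlookup(
	int	hit)
{
	if (hit)
		XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
	else
		XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
}
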
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
new file mode 100644
index 00000000000..c96a8a05ac0
--- /dev/null
+++ b/fs/xfs/xfs_rw.c
@@ -0,0 +1,175 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_dinode.h"
30#include "xfs_inode.h"
31#include "xfs_error.h"
32#include "xfs_rw.h"
33
34/*
35 * Force a shutdown of the filesystem instantly while keeping
36 * the filesystem consistent. We don't do an unmount here; just shutdown
37 * the shop, make sure that absolutely nothing persistent happens to
38 * this filesystem after this point.
39 */
40void
41xfs_do_force_shutdown(
42 xfs_mount_t *mp,
43 int flags,
44 char *fname,
45 int lnnum)
46{
47 int logerror;
48
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 xfs_notice(mp,
53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 __func__, flags, lnnum, fname, __return_address);
55 }
56 /*
57 * No need to duplicate efforts.
58 */
59 if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
60 return;
61
62 /*
63 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
64 * queue up anybody new on the log reservations, and wakes up
65 * everybody who's sleeping on log reservations to tell them
66 * the bad news.
67 */
68 if (xfs_log_force_umount(mp, logerror))
69 return;
70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem");
74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 xfs_stack_trace();
76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
77 if (logerror) {
78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
79 "Log I/O Error Detected. Shutting down filesystem");
80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
82 "All device paths lost. Shutting down filesystem");
83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "I/O Error Detected. Shutting down filesystem");
86 }
87 }
88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
89 xfs_alert(mp,
90 "Please umount the filesystem and rectify the problem(s)");
91 }
92}
93
94/*
95 * Prints out an ALERT message about I/O error.
96 */
97void
98xfs_ioerror_alert(
99 char *func,
100 struct xfs_mount *mp,
101 xfs_buf_t *bp,
102 xfs_daddr_t blkno)
103{
104 xfs_alert(mp,
105 "I/O error occurred: meta-data dev %s block 0x%llx"
106 " (\"%s\") error %d buf count %zd",
107 xfs_buf_target_name(bp->b_target),
108 (__uint64_t)blkno, func,
109 bp->b_error, XFS_BUF_COUNT(bp));
110}
111
112/*
113 * This isn't an absolute requirement, but it is
114 * just a good idea to call xfs_read_buf instead of
115 * directly doing a read_buf call. For one, we shouldn't
116 * be doing this disk read if we are in SHUTDOWN state anyway,
117 * so this stops that from happening. Secondly, this does all
118 * the error checking stuff and the brelse if appropriate for
119 * the caller, so the code can be a little leaner.
120 */
121
122int
123xfs_read_buf(
124 struct xfs_mount *mp,
125 xfs_buftarg_t *target,
126 xfs_daddr_t blkno,
127 int len,
128 uint flags,
129 xfs_buf_t **bpp)
130{
131 xfs_buf_t *bp;
132 int error;
133
134 if (!flags)
135 flags = XBF_LOCK | XBF_MAPPED;
136
137 bp = xfs_buf_read(target, blkno, len, flags);
138 if (!bp)
139 return XFS_ERROR(EIO);
140 error = bp->b_error;
141 if (!error && !XFS_FORCED_SHUTDOWN(mp)) {
142 *bpp = bp;
143 } else {
144 *bpp = NULL;
145 if (error) {
146 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
147 } else {
148 error = XFS_ERROR(EIO);
149 }
150 if (bp) {
151 XFS_BUF_UNDONE(bp);
152 XFS_BUF_UNDELAYWRITE(bp);
153 XFS_BUF_STALE(bp);
154 /*
155 * brelse clears B_ERROR and b_error
156 */
157 xfs_buf_relse(bp);
158 }
159 }
160 return (error);
161}
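
A hedged sketch of the caller pattern the comment above describes (invented function name): on any error xfs_read_buf() has already released the buffer, so callers only relse on the success path:

STATIC int
xfs_example_read_sector(
	struct xfs_mount	*mp,
	xfs_daddr_t		blkno)
{
	xfs_buf_t		*bp;
	int			error;

	error = xfs_read_buf(mp, mp->m_ddev_targp, blkno,
			     BTOBB(mp->m_sb.sb_sectsize), 0, &bp);
	if (error)
		return error;
	/* ... examine the buffer contents ... */
	xfs_buf_relse(bp);
	return 0;
}
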
162
163/*
164 * helper function to extract extent size hint from inode
165 */
166xfs_extlen_t
167xfs_get_extsz_hint(
168 struct xfs_inode *ip)
169{
170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
171 return ip->i_d.di_extsize;
172 if (XFS_IS_REALTIME_INODE(ip))
173 return ip->i_mount->m_sb.sb_rextsize;
174 return 0;
175}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
new file mode 100644
index 00000000000..11c41ec6ed7
--- /dev/null
+++ b/fs/xfs/xfs_rw.h
@@ -0,0 +1,49 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_RW_H__
19#define __XFS_RW_H__
20
21struct xfs_buf;
22struct xfs_inode;
23struct xfs_mount;
24
25/*
26 * Convert the given file system block to a disk block.
27 * We have to treat it differently based on whether the
28 * file is a real time file or not, because the bmap code
29 * does.
30 */
31static inline xfs_daddr_t
32xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
33{
34 return (XFS_IS_REALTIME_INODE(ip) ? \
35 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
36 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
37}
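
A hedged sketch of a use of the conversion (invented helper): given a bmap extent record, the same call yields a disk address whether the inode is a realtime file (plain FSB-to-BB shift) or a data device file (AG-encoded block number):

static inline xfs_daddr_t
xfs_example_irec_daddr(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*map)
{
	return xfs_fsb_to_db(ip, map->br_startblock);
}
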
38
39/*
40 * Prototypes for functions in xfs_rw.c.
41 */
42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
43 xfs_daddr_t blkno, int len, uint flags,
44 struct xfs_buf **bpp);
45extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
46 xfs_buf_t *bp, xfs_daddr_t blkno);
47extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
48
49#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
new file mode 100644
index 00000000000..4604f90f86a
--- /dev/null
+++ b/fs/xfs/xfs_sync.c
@@ -0,0 +1,1065 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_inode.h"
31#include "xfs_dinode.h"
32#include "xfs_error.h"
33#include "xfs_filestream.h"
34#include "xfs_vnodeops.h"
35#include "xfs_inode_item.h"
36#include "xfs_quota.h"
37#include "xfs_trace.h"
38#include "xfs_fsops.h"
39
40#include <linux/kthread.h>
41#include <linux/freezer.h>
42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
44
45/*
46 * The inode lookup is done in batches to keep the amount of lock traffic and
47 * radix tree lookups to a minimum. The batch size is a trade off between
48 * lookup reduction and stack usage. This is in the reclaim path, so we can't
49 * be too greedy.
50 */
51#define XFS_LOOKUP_BATCH 32
52
53STATIC int
54xfs_inode_ag_walk_grab(
55 struct xfs_inode *ip)
56{
57 struct inode *inode = VFS_I(ip);
58
59 ASSERT(rcu_read_lock_held());
60
61 /*
62 * check for stale RCU freed inode
63 *
64 * If the inode has been reallocated, it doesn't matter if it's not in
65 * the AG we are walking - we are walking for writeback, so if it
66 * passes all the "valid inode" checks and is dirty, then we'll write
67 * it back anyway. If it has been reallocated and is still being
68 * initialised, the XFS_INEW check below will catch it.
69 */
70 spin_lock(&ip->i_flags_lock);
71 if (!ip->i_ino)
72 goto out_unlock_noent;
73
74 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
75 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
76 goto out_unlock_noent;
77 spin_unlock(&ip->i_flags_lock);
78
79 /* nothing to sync during shutdown */
80 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
81 return EFSCORRUPTED;
82
83 /* If we can't grab the inode, it must be on its way to reclaim. */
84 if (!igrab(inode))
85 return ENOENT;
86
87 if (is_bad_inode(inode)) {
88 IRELE(ip);
89 return ENOENT;
90 }
91
92 /* inode is valid */
93 return 0;
94
95out_unlock_noent:
96 spin_unlock(&ip->i_flags_lock);
97 return ENOENT;
98}
99
100STATIC int
101xfs_inode_ag_walk(
102 struct xfs_mount *mp,
103 struct xfs_perag *pag,
104 int (*execute)(struct xfs_inode *ip,
105 struct xfs_perag *pag, int flags),
106 int flags)
107{
108 uint32_t first_index;
109 int last_error = 0;
110 int skipped;
111 int done;
112 int nr_found;
113
114restart:
115 done = 0;
116 skipped = 0;
117 first_index = 0;
118 nr_found = 0;
119 do {
120 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
121 int error = 0;
122 int i;
123
124 rcu_read_lock();
125 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
126 (void **)batch, first_index,
127 XFS_LOOKUP_BATCH);
128 if (!nr_found) {
129 rcu_read_unlock();
130 break;
131 }
132
133 /*
134 * Grab the inodes before we drop the lock. If we found
135 * nothing, nr_found == 0 and the loop will be skipped.
136 */
137 for (i = 0; i < nr_found; i++) {
138 struct xfs_inode *ip = batch[i];
139
140 if (done || xfs_inode_ag_walk_grab(ip))
141 batch[i] = NULL;
142
143 /*
144 * Update the index for the next lookup. Catch
145 * overflows into the next AG range which can occur if
146 * we have inodes in the last block of the AG and we
147 * are currently pointing to the last inode.
148 *
149 * Because we may see inodes that are from the wrong AG
150 * due to RCU freeing and reallocation, only update the
151 * index if it lies in this AG. It was a race that led
152 * us to see this inode, so another lookup from the
153 * same index will not find it again.
154 */
155 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
156 continue;
157 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
158 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
159 done = 1;
160 }
161
162 /* unlock now we've grabbed the inodes. */
163 rcu_read_unlock();
164
165 for (i = 0; i < nr_found; i++) {
166 if (!batch[i])
167 continue;
168 error = execute(batch[i], pag, flags);
169 IRELE(batch[i]);
170 if (error == EAGAIN) {
171 skipped++;
172 continue;
173 }
174 if (error && last_error != EFSCORRUPTED)
175 last_error = error;
176 }
177
178 /* bail out if the filesystem is corrupted. */
179 if (error == EFSCORRUPTED)
180 break;
181
182 cond_resched();
183
184 } while (nr_found && !done);
185
186 if (skipped) {
187 delay(1);
188 goto restart;
189 }
190 return last_error;
191}
192
193int
194xfs_inode_ag_iterator(
195 struct xfs_mount *mp,
196 int (*execute)(struct xfs_inode *ip,
197 struct xfs_perag *pag, int flags),
198 int flags)
199{
200 struct xfs_perag *pag;
201 int error = 0;
202 int last_error = 0;
203 xfs_agnumber_t ag;
204
205 ag = 0;
206 while ((pag = xfs_perag_get(mp, ag))) {
207 ag = pag->pag_agno + 1;
208 error = xfs_inode_ag_walk(mp, pag, execute, flags);
209 xfs_perag_put(pag);
210 if (error) {
211 last_error = error;
212 if (error == EFSCORRUPTED)
213 break;
214 }
215 }
216 return XFS_ERROR(last_error);
217}
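
A hedged sketch of an execute callback for the iterator above (invented; the real callbacks are xfs_sync_inode_data() and xfs_sync_inode_attr() below). The walk holds an inode reference around the call, and a positive EAGAIN return causes the walk to be restarted:

STATIC int
xfs_example_inode_callback(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags)
{
	/* per-inode work goes here */
	return 0;
}

/* usage: error = xfs_inode_ag_iterator(mp, xfs_example_inode_callback, 0); */
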
218
219STATIC int
220xfs_sync_inode_data(
221 struct xfs_inode *ip,
222 struct xfs_perag *pag,
223 int flags)
224{
225 struct inode *inode = VFS_I(ip);
226 struct address_space *mapping = inode->i_mapping;
227 int error = 0;
228
229 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
230 goto out_wait;
231
232 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
233 if (flags & SYNC_TRYLOCK)
234 goto out_wait;
235 xfs_ilock(ip, XFS_IOLOCK_SHARED);
236 }
237
238 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
239 0 : XBF_ASYNC, FI_NONE);
240 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241
242 out_wait:
243 if (flags & SYNC_WAIT)
244 xfs_ioend_wait(ip);
245 return error;
246}
247
248STATIC int
249xfs_sync_inode_attr(
250 struct xfs_inode *ip,
251 struct xfs_perag *pag,
252 int flags)
253{
254 int error = 0;
255
256 xfs_ilock(ip, XFS_ILOCK_SHARED);
257 if (xfs_inode_clean(ip))
258 goto out_unlock;
259 if (!xfs_iflock_nowait(ip)) {
260 if (!(flags & SYNC_WAIT))
261 goto out_unlock;
262 xfs_iflock(ip);
263 }
264
265 if (xfs_inode_clean(ip)) {
266 xfs_ifunlock(ip);
267 goto out_unlock;
268 }
269
270 error = xfs_iflush(ip, flags);
271
272 /*
273 * We don't want to try again on non-blocking flushes that can't run
274 * again immediately. If an inode really must be written, then that's
275 * what the SYNC_WAIT flag is for.
276 */
277 if (error == EAGAIN) {
278 ASSERT(!(flags & SYNC_WAIT));
279 error = 0;
280 }
281
282 out_unlock:
283 xfs_iunlock(ip, XFS_ILOCK_SHARED);
284 return error;
285}
286
287/*
288 * Write out pagecache data for the whole filesystem.
289 */
290STATIC int
291xfs_sync_data(
292 struct xfs_mount *mp,
293 int flags)
294{
295 int error;
296
297 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
298
299 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
300 if (error)
301 return XFS_ERROR(error);
302
303 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
304 return 0;
305}
306
307/*
308 * Write out inode metadata (attributes) for the whole filesystem.
309 */
310STATIC int
311xfs_sync_attr(
312 struct xfs_mount *mp,
313 int flags)
314{
315 ASSERT((flags & ~SYNC_WAIT) == 0);
316
317 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
318}
319
320STATIC int
321xfs_sync_fsdata(
322 struct xfs_mount *mp)
323{
324 struct xfs_buf *bp;
325
326 /*
327 * If the buffer is pinned then push on the log so we won't get stuck
328 * waiting in the write for someone, maybe ourselves, to flush the log.
329 *
330 * Even though we just pushed the log above, we did not have the
331 * superblock buffer locked at that point so it can become pinned in
332 * between there and here.
333 */
334 bp = xfs_getsb(mp, 0);
335 if (xfs_buf_ispinned(bp))
336 xfs_log_force(mp, 0);
337
338 return xfs_bwrite(mp, bp);
339}
340
341/*
342 * When remounting a filesystem read-only or freezing the filesystem, we have
343 * two phases to execute. This first phase is syncing the data before we
344 * quiesce the filesystem, and the second is flushing all the inodes out after
345 * we've waited for all the transactions created by the first phase to
346 * complete. The second phase ensures that the inodes are written to their
347 * location on disk rather than just existing in transactions in the log. This
348 * means after a quiesce there is no log replay required to write the inodes to
349 * disk (this is the main difference between a sync and a quiesce).
350 */
351/*
352 * First stage of freeze - no writers will make progress now we are here,
353 * so we flush delwri and delalloc buffers here, then wait for all I/O to
354 * complete. Data is frozen at that point. Metadata is not frozen,
355 * transactions can still occur here so don't bother flushing the buftarg
356 * because it'll just get dirty again.
357 */
358int
359xfs_quiesce_data(
360 struct xfs_mount *mp)
361{
362 int error, error2 = 0;
363
364 xfs_qm_sync(mp, SYNC_TRYLOCK);
365 xfs_qm_sync(mp, SYNC_WAIT);
366
367 /* force out the newly dirtied log buffers */
368 xfs_log_force(mp, XFS_LOG_SYNC);
369
370 /* write superblock and hoover up shutdown errors */
371 error = xfs_sync_fsdata(mp);
372
373 /* make sure all delwri buffers are written out */
374 xfs_flush_buftarg(mp->m_ddev_targp, 1);
375
376 /* mark the log as covered if needed */
377 if (xfs_log_need_covered(mp))
378 error2 = xfs_fs_log_dummy(mp);
379
380 /* flush data-only devices */
381 if (mp->m_rtdev_targp)
382 XFS_bflush(mp->m_rtdev_targp);
383
384 return error ? error : error2;
385}
386
387STATIC void
388xfs_quiesce_fs(
389 struct xfs_mount *mp)
390{
391 int count = 0, pincount;
392
393 xfs_reclaim_inodes(mp, 0);
394 xfs_flush_buftarg(mp->m_ddev_targp, 0);
395
396 /*
397 * This loop must run at least twice. The first instance of the loop
398 * will flush most metadata but that will generate more metadata
399 * (typically directory updates), which then must be flushed and
400 * logged before we can write the unmount record. We also do sync
401 * reclaim of inodes to catch any that the above delwri flush skipped.
402 */
403 do {
404 xfs_reclaim_inodes(mp, SYNC_WAIT);
405 xfs_sync_attr(mp, SYNC_WAIT);
406 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
407 if (!pincount) {
408 delay(50);
409 count++;
410 }
411 } while (count < 2);
412}
413
414/*
415 * Second stage of a quiesce. The data is already synced, now we have to take
416 * care of the metadata. New transactions are already blocked, so we need to
417 * wait for any remaining transactions to drain out before proceeding.
418 */
419void
420xfs_quiesce_attr(
421 struct xfs_mount *mp)
422{
423 int error = 0;
424
425 /* wait for all modifications to complete */
426 while (atomic_read(&mp->m_active_trans) > 0)
427 delay(100);
428
429 /* flush inodes and push all remaining buffers out to disk */
430 xfs_quiesce_fs(mp);
431
432 /*
433 * Just warn here till VFS can correctly support
434 * read-only remount without racing.
435 */
436 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
437
438 /* Push the superblock and write an unmount record */
439 error = xfs_log_sbcount(mp);
440 if (error)
441 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
442 "Frozen image may not be consistent.");
443 xfs_log_unmount_write(mp);
444 xfs_unmountfs_writesb(mp);
445}
446
447static void
448xfs_syncd_queue_sync(
449 struct xfs_mount *mp)
450{
451 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
452 msecs_to_jiffies(xfs_syncd_centisecs * 10));
453}
454
455/*
456 * Every sync period we need to unpin all items, reclaim inodes and sync
457 * disk quotas. We might need to cover the log to indicate that the
458 * filesystem is idle and not frozen.
459 */
460STATIC void
461xfs_sync_worker(
462 struct work_struct *work)
463{
464 struct xfs_mount *mp = container_of(to_delayed_work(work),
465 struct xfs_mount, m_sync_work);
466 int error;
467
468 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
469 /* dgc: errors ignored here */
470 if (mp->m_super->s_frozen == SB_UNFROZEN &&
471 xfs_log_need_covered(mp))
472 error = xfs_fs_log_dummy(mp);
473 else
474 xfs_log_force(mp, 0);
475 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
476
477 /* start pushing all the metadata that is currently dirty */
478 xfs_ail_push_all(mp->m_ail);
479 }
480
481 /* queue us up again */
482 xfs_syncd_queue_sync(mp);
483}
484
485/*
486 * Queue a new inode reclaim pass if there are reclaimable inodes and there
487 * isn't a reclaim pass already in progress. By default it runs every 5s based
488 * on the xfs syncd work default of 30s. Perhaps this should have its own
489 * tunable, but that can be done if this method proves to be ineffective or too
490 * aggressive.
491 */
492static void
493xfs_syncd_queue_reclaim(
494 struct xfs_mount *mp)
495{
496
497 /*
498 * We can have inodes enter reclaim after we've shut down the syncd
499 * workqueue during unmount, so don't allow reclaim work to be queued
500 * during unmount.
501 */
502 if (!(mp->m_super->s_flags & MS_ACTIVE))
503 return;
504
505 rcu_read_lock();
506 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
507 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
508 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
509 }
510 rcu_read_unlock();
511}
512
513/*
514 * This is a fast pass over the inode cache to try to get reclaim moving on as
515 * many inodes as possible in a short period of time. It kicks itself every few
516 * seconds, as well as being kicked by the inode cache shrinker when memory
517 * goes low. It scans as quickly as possible avoiding locked inodes or those
518 * already being flushed, and once done schedules a future pass.
519 */
520STATIC void
521xfs_reclaim_worker(
522 struct work_struct *work)
523{
524 struct xfs_mount *mp = container_of(to_delayed_work(work),
525 struct xfs_mount, m_reclaim_work);
526
527 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
528 xfs_syncd_queue_reclaim(mp);
529}
530
531/*
532 * Flush delayed allocate data, attempting to free up reserved space
533 * from existing allocations. At this point a new allocation attempt
534 * has failed with ENOSPC and we are in the process of scratching our
535 * heads, looking about for more room.
536 *
537 * Queue a new data flush if there isn't one already in progress and
538 * wait for completion of the flush. This means that we only ever have one
539 * inode flush in progress no matter how many ENOSPC events are occurring and
540 * so will prevent the system from bogging down due to every concurrent
541 * ENOSPC event scanning all the active inodes in the system for writeback.
542 */
543void
544xfs_flush_inodes(
545 struct xfs_inode *ip)
546{
547 struct xfs_mount *mp = ip->i_mount;
548
549 queue_work(xfs_syncd_wq, &mp->m_flush_work);
550 flush_work_sync(&mp->m_flush_work);
551}
552
553STATIC void
554xfs_flush_worker(
555 struct work_struct *work)
556{
557 struct xfs_mount *mp = container_of(work,
558 struct xfs_mount, m_flush_work);
559
560 xfs_sync_data(mp, SYNC_TRYLOCK);
561 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
562}
563
564int
565xfs_syncd_init(
566 struct xfs_mount *mp)
567{
568 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
569 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
570 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
571
572 xfs_syncd_queue_sync(mp);
573 xfs_syncd_queue_reclaim(mp);
574
575 return 0;
576}
577
578void
579xfs_syncd_stop(
580 struct xfs_mount *mp)
581{
582 cancel_delayed_work_sync(&mp->m_sync_work);
583 cancel_delayed_work_sync(&mp->m_reclaim_work);
584 cancel_work_sync(&mp->m_flush_work);
585}
586
587void
588__xfs_inode_set_reclaim_tag(
589 struct xfs_perag *pag,
590 struct xfs_inode *ip)
591{
592 radix_tree_tag_set(&pag->pag_ici_root,
593 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
594 XFS_ICI_RECLAIM_TAG);
595
596 if (!pag->pag_ici_reclaimable) {
597 /* propagate the reclaim tag up into the perag radix tree */
598 spin_lock(&ip->i_mount->m_perag_lock);
599 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
600 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
601 XFS_ICI_RECLAIM_TAG);
602 spin_unlock(&ip->i_mount->m_perag_lock);
603
604 /* schedule periodic background inode reclaim */
605 xfs_syncd_queue_reclaim(ip->i_mount);
606
607 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
608 -1, _RET_IP_);
609 }
610 pag->pag_ici_reclaimable++;
611}
612
613/*
614 * We set the inode flag atomically with the radix tree tag.
615 * Once we get tag lookups on the radix tree, this inode flag
616 * can go away.
617 */
618void
619xfs_inode_set_reclaim_tag(
620 xfs_inode_t *ip)
621{
622 struct xfs_mount *mp = ip->i_mount;
623 struct xfs_perag *pag;
624
625 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
626 spin_lock(&pag->pag_ici_lock);
627 spin_lock(&ip->i_flags_lock);
628 __xfs_inode_set_reclaim_tag(pag, ip);
629 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
630 spin_unlock(&ip->i_flags_lock);
631 spin_unlock(&pag->pag_ici_lock);
632 xfs_perag_put(pag);
633}
634
635STATIC void
636__xfs_inode_clear_reclaim(
637 xfs_perag_t *pag,
638 xfs_inode_t *ip)
639{
640 pag->pag_ici_reclaimable--;
641 if (!pag->pag_ici_reclaimable) {
642 /* clear the reclaim tag from the perag radix tree */
643 spin_lock(&ip->i_mount->m_perag_lock);
644 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
645 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
646 XFS_ICI_RECLAIM_TAG);
647 spin_unlock(&ip->i_mount->m_perag_lock);
648 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
649 -1, _RET_IP_);
650 }
651}
652
653void
654__xfs_inode_clear_reclaim_tag(
655 xfs_mount_t *mp,
656 xfs_perag_t *pag,
657 xfs_inode_t *ip)
658{
659 radix_tree_tag_clear(&pag->pag_ici_root,
660 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
661 __xfs_inode_clear_reclaim(pag, ip);
662}
663
664/*
665 * Grab the inode for reclaim exclusively.
666 * Return 0 if we grabbed it, non-zero otherwise.
667 */
668STATIC int
669xfs_reclaim_inode_grab(
670 struct xfs_inode *ip,
671 int flags)
672{
673 ASSERT(rcu_read_lock_held());
674
675 /* quick check for stale RCU freed inode */
676 if (!ip->i_ino)
677 return 1;
678
679 /*
680 * do some unlocked checks first to avoid unnecessary lock traffic.
681 * The first is a flush lock check, the second is an already-in-reclaim
682 * check. Only do these checks if we are not going to block on locks.
683 */
684 if ((flags & SYNC_TRYLOCK) &&
685 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
686 return 1;
687 }
688
689 /*
690 * The radix tree lock here protects a thread in xfs_iget from racing
691 * with us starting reclaim on the inode. Once we have the
692 * XFS_IRECLAIM flag set it will not touch us.
693 *
694 * Due to RCU lookup, we may find inodes that have been freed and only
695 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
696 * aren't candidates for reclaim at all, so we must check the
697 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
698 */
699 spin_lock(&ip->i_flags_lock);
700 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
701 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
702 /* not a reclaim candidate. */
703 spin_unlock(&ip->i_flags_lock);
704 return 1;
705 }
706 __xfs_iflags_set(ip, XFS_IRECLAIM);
707 spin_unlock(&ip->i_flags_lock);
708 return 0;
709}
710
711/*
712 * Inodes in different states need to be treated differently, and the return
713 * value of xfs_iflush is not sufficient to get this right. The following table
714 * lists the inode states and the reclaim actions necessary for non-blocking
715 * reclaim:
716 *
717 *
718 *	inode state		iflush ret	required action
719 *	---------------		----------	---------------
720 *	bad			-		reclaim
721 *	shutdown		EIO		unpin and reclaim
722 *	clean, unpinned		0		reclaim
723 *	stale, unpinned		0		reclaim
724 *	clean, pinned(*)	0		requeue
725 *	stale, pinned		EAGAIN		requeue
726 *	dirty, delwri ok	0		requeue
727 *	dirty, delwri blocked	EAGAIN		requeue
728 *	dirty, sync flush	0		reclaim
729 *
730 * (*) dgc: I don't think the clean, pinned state is possible but it gets
731 * handled anyway given the order of checks implemented.
732 *
733 * As can be seen from the table, the return value of xfs_iflush() is not
734 * sufficient to correctly decide the reclaim action here. The checks in
735 * xfs_iflush() might look like duplicates, but they are not.
736 *
737 * Also, because we get the flush lock first, we know that any inode that has
738 * been flushed delwri has had the flush completed by the time we check that
739 * the inode is clean. The clean inode check needs to be done before flushing
740 * the inode delwri; otherwise we would loop forever requeuing clean inodes, as
741 * we cannot tell apart a successful delwri flush and a clean inode from the
742 * return value of xfs_iflush().
743 *
744 * Note that because the inode is flushed delayed write by background
745 * writeback, the flush lock may already be held here and waiting on it can
746 * result in very long latencies. Hence for sync reclaims, where we wait on the
747 * flush lock, the caller should push out delayed write inodes first before
748 * trying to reclaim them to minimise the amount of time spent waiting. For
749 * background reclaim, we just requeue the inode for the next pass.
750 *
751 * Hence the order of actions after gaining the locks should be:
752 * bad => reclaim
753 * shutdown => unpin and reclaim
754 * pinned, delwri => requeue
755 * pinned, sync => unpin
756 * stale => reclaim
757 * clean => reclaim
758 * dirty, delwri => flush and requeue
759 * dirty, sync => flush, wait and reclaim
760 */
761STATIC int
762xfs_reclaim_inode(
763 struct xfs_inode *ip,
764 struct xfs_perag *pag,
765 int sync_mode)
766{
767 int error;
768
769restart:
770 error = 0;
771 xfs_ilock(ip, XFS_ILOCK_EXCL);
772 if (!xfs_iflock_nowait(ip)) {
773 if (!(sync_mode & SYNC_WAIT))
774 goto out;
775 xfs_iflock(ip);
776 }
777
778 if (is_bad_inode(VFS_I(ip)))
779 goto reclaim;
780 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
781 xfs_iunpin_wait(ip);
782 goto reclaim;
783 }
784 if (xfs_ipincount(ip)) {
785 if (!(sync_mode & SYNC_WAIT)) {
786 xfs_ifunlock(ip);
787 goto out;
788 }
789 xfs_iunpin_wait(ip);
790 }
791 if (xfs_iflags_test(ip, XFS_ISTALE))
792 goto reclaim;
793 if (xfs_inode_clean(ip))
794 goto reclaim;
795
796 /*
797 * Now we have an inode that needs flushing.
798 *
799 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
800 * reclaim as we can deadlock with inode cluster removal.
801 * xfs_ifree_cluster() can lock the inode buffer before it locks the
802 * ip->i_lock, and we are doing the exact opposite here. As a result,
803 * doing a blocking xfs_itobp() to get the cluster buffer will result
804 * in an ABBA deadlock with xfs_ifree_cluster().
805 *
806 * As xfs_ifree_cluster() must gather all inodes that are active in the
807 * cache to mark them stale, if we hit this case we don't actually want
808 * to do IO here - we want the inode marked stale so we can simply
809 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
810 * just unlock the inode, back off and try again. Hopefully the next
811 * pass through will see the stale flag set on the inode.
812 */
813 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
814 if (sync_mode & SYNC_WAIT) {
815 if (error == EAGAIN) {
816 xfs_iunlock(ip, XFS_ILOCK_EXCL);
817 /* backoff longer than in xfs_ifree_cluster */
818 delay(2);
819 goto restart;
820 }
821 xfs_iflock(ip);
822 goto reclaim;
823 }
824
825 /*
826 * When we have to flush an inode but don't have SYNC_WAIT set, we
827 * flush the inode out using a delwri buffer and wait for the next
828 * call into reclaim to find it in a clean state instead of waiting for
829 * it now. We also don't return errors here - if the error is transient
830 * then the next reclaim pass will flush the inode, and if the error
831 * is permanent then the next sync reclaim will reclaim the inode and
832 * pass on the error.
833 */
834 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
835 xfs_warn(ip->i_mount,
836 "inode 0x%llx background reclaim flush failed with %d",
837 (long long)ip->i_ino, error);
838 }
839out:
840 xfs_iflags_clear(ip, XFS_IRECLAIM);
841 xfs_iunlock(ip, XFS_ILOCK_EXCL);
842 /*
843 * We could return EAGAIN here to make reclaim rescan the inode tree in
844 * a short while. However, this just burns CPU time scanning the tree
845 * waiting for IO to complete and xfssyncd never goes back to the idle
846 * state. Instead, return 0 to let the next scheduled background reclaim
847 * attempt to reclaim the inode again.
848 */
849 return 0;
850
851reclaim:
852 xfs_ifunlock(ip);
853 xfs_iunlock(ip, XFS_ILOCK_EXCL);
854
855 XFS_STATS_INC(xs_ig_reclaims);
856 /*
857 * Remove the inode from the per-AG radix tree.
858 *
859 * Because radix_tree_delete won't complain even if the item was never
860 * added to the tree, assert that it's been there before, to catch
861 * problems with the inode lifetime early on.
862 */
863 spin_lock(&pag->pag_ici_lock);
864 if (!radix_tree_delete(&pag->pag_ici_root,
865 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
866 ASSERT(0);
867 __xfs_inode_clear_reclaim(pag, ip);
868 spin_unlock(&pag->pag_ici_lock);
869
870 /*
871 * Here we do an (almost) spurious inode lock in order to coordinate
872 * with inode cache radix tree lookups. This is because the lookup
873 * can reference the inodes in the cache without taking references.
874 *
875 * We make that OK here by ensuring that we wait until the inode is
876 * unlocked after the lookup before we go ahead and free it. We get
877 * both the ilock and the iolock because the code may need to drop the
878 * ilock but will still hold the iolock.
879 */
880 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
881 xfs_qm_dqdetach(ip);
882 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
883
884 xfs_inode_free(ip);
885 return error;
886
887}
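
The flush comment above describes a general ABBA-avoidance discipline: never block on lock B while holding lock A when another thread takes them in the opposite order; trylock B, and on failure drop A, back off and retry. A standalone userspace sketch of that discipline (pthreads; both locks are hypothetical):

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/*
 * Acquire both locks without risking an ABBA deadlock against a
 * thread that takes lock_b before lock_a: never sleep on lock_b
 * while lock_a is held.
 */
static void
lock_both_abba_safe(void)
{
	for (;;) {
		pthread_mutex_lock(&lock_a);
		if (pthread_mutex_trylock(&lock_b) == 0)
			return;			/* holding a then b */
		pthread_mutex_unlock(&lock_a);
		usleep(2000);			/* back off, cf. delay(2) above */
	}
}
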
888
889/*
890 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
891 * corrupted, we still want to try to reclaim all the inodes. If we don't,
892 * then a shutdown during the filesystem unmount reclaim walk will leak
893 * all the unreclaimed inodes.
894 */
895int
896xfs_reclaim_inodes_ag(
897 struct xfs_mount *mp,
898 int flags,
899 int *nr_to_scan)
900{
901 struct xfs_perag *pag;
902 int error = 0;
903 int last_error = 0;
904 xfs_agnumber_t ag;
905 int trylock = flags & SYNC_TRYLOCK;
906 int skipped;
907
908restart:
909 ag = 0;
910 skipped = 0;
911 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
912 unsigned long first_index = 0;
913 int done = 0;
914 int nr_found = 0;
915
916 ag = pag->pag_agno + 1;
917
918 if (trylock) {
919 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
920 skipped++;
921 xfs_perag_put(pag);
922 continue;
923 }
924 first_index = pag->pag_ici_reclaim_cursor;
925 } else
926 mutex_lock(&pag->pag_ici_reclaim_lock);
927
928 do {
929 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
930 int i;
931
932 rcu_read_lock();
933 nr_found = radix_tree_gang_lookup_tag(
934 &pag->pag_ici_root,
935 (void **)batch, first_index,
936 XFS_LOOKUP_BATCH,
937 XFS_ICI_RECLAIM_TAG);
938 if (!nr_found) {
939 done = 1;
940 rcu_read_unlock();
941 break;
942 }
943
944 /*
945 * Grab the inodes before we drop the lock. If we found
946 * nothing, nr_found == 0 and the loop will be skipped.
947 */
948 for (i = 0; i < nr_found; i++) {
949 struct xfs_inode *ip = batch[i];
950
951 if (done || xfs_reclaim_inode_grab(ip, flags))
952 batch[i] = NULL;
953
954 /*
955 * Update the index for the next lookup. Catch
956 * overflows into the next AG range which can
957 * occur if we have inodes in the last block of
958 * the AG and we are currently pointing to the
959 * last inode.
960 *
961 * Because we may see inodes that are from the
962 * wrong AG due to RCU freeing and
963 * reallocation, only update the index if it
964 * lies in this AG. It was a race that led us
965 * to see this inode, so another lookup from
966 * the same index will not find it again.
967 */
968 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
969 pag->pag_agno)
970 continue;
971 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
972 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
973 done = 1;
974 }
975
976 /* unlock now we've grabbed the inodes. */
977 rcu_read_unlock();
978
979 for (i = 0; i < nr_found; i++) {
980 if (!batch[i])
981 continue;
982 error = xfs_reclaim_inode(batch[i], pag, flags);
983 if (error && last_error != EFSCORRUPTED)
984 last_error = error;
985 }
986
987 *nr_to_scan -= XFS_LOOKUP_BATCH;
988
989 cond_resched();
990
991 } while (nr_found && !done && *nr_to_scan > 0);
992
993 if (trylock && !done)
994 pag->pag_ici_reclaim_cursor = first_index;
995 else
996 pag->pag_ici_reclaim_cursor = 0;
997 mutex_unlock(&pag->pag_ici_reclaim_lock);
998 xfs_perag_put(pag);
999 }
1000
1001 /*
1002 * If we skipped any AG, and we still have scan count remaining, do
1003 * another pass, this time using blocking reclaim semantics (i.e.
1004 * waiting on the reclaim locks and ignoring the reclaim cursors). This
1005 * ensures that when we get more reclaimers than AGs we block rather
1006 * than spin trying to execute reclaim.
1007 */
1008 if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
1009 trylock = 0;
1010 goto restart;
1011 }
1012 return XFS_ERROR(last_error);
1013}
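
One subtle detail in the batch loop above is the cursor update, which relies on unsigned wrap-around to detect that the last inode in the AG's index space has been visited. The check in isolation (plain C; cursor_advance is an invented name):

#include <stdbool.h>
#include <stdint.h>

/*
 * Advance a 32-bit per-AG inode cursor past agino. If the increment
 * wraps, agino was the last possible index and the walk is done.
 */
static bool
cursor_advance(uint32_t *cursor, uint32_t agino)
{
	*cursor = agino + 1;
	return *cursor < agino;		/* true => wrapped, stop the walk */
}
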
1014
1015int
1016xfs_reclaim_inodes(
1017 xfs_mount_t *mp,
1018 int mode)
1019{
1020 int nr_to_scan = INT_MAX;
1021
1022 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
1023}
1024
1025/*
1026 * Scan a certain number of inodes for reclaim.
1027 *
1028 * When called, we make sure there is a background (fast) inode reclaim in
1029 * progress, while we ourselves throttle the speed of reclaim by doing
1030 * synchronous reclaim of inodes. That means if we come across dirty inodes,
1031 * we wait for them to be cleaned, which we hope will not take long because
1032 * the background walker has already kicked the IO off on those dirty inodes.
1033 */
1034void
1035xfs_reclaim_inodes_nr(
1036 struct xfs_mount *mp,
1037 int nr_to_scan)
1038{
1039 /* kick background reclaimer and push the AIL */
1040 xfs_syncd_queue_reclaim(mp);
1041 xfs_ail_push_all(mp->m_ail);
1042
1043 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
1044}
1045
1046/*
1047 * Return the number of reclaimable inodes in the filesystem for
1048 * the shrinker to determine how much to reclaim.
1049 */
1050int
1051xfs_reclaim_inodes_count(
1052 struct xfs_mount *mp)
1053{
1054 struct xfs_perag *pag;
1055 xfs_agnumber_t ag = 0;
1056 int reclaimable = 0;
1057
1058 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1059 ag = pag->pag_agno + 1;
1060 reclaimable += pag->pag_ici_reclaimable;
1061 xfs_perag_put(pag);
1062 }
1063 return reclaimable;
1064}
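
Taken together, the last two functions give a memory shrinker what it needs: a cheap count of reclaimable inodes and a throttled, bounded scan. A schematic pairing, assuming a hypothetical pressure callback (the real shrinker registration is kernel-version specific and not shown in this diff):

/*
 * Hedged sketch of a memory-pressure callback built on the two
 * helpers above; returns an estimate of what remains reclaimable.
 */
static int
example_shrink_inode_cache(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	if (nr_to_scan)
		xfs_reclaim_inodes_nr(mp, nr_to_scan);	/* throttled reclaim */
	return xfs_reclaim_inodes_count(mp);		/* remaining estimate */
}
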
1065
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
new file mode 100644
index 00000000000..941202e7ac6
--- /dev/null
+++ b/fs/xfs/xfs_sync.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_SYNC_H
19#define XFS_SYNC_H 1
20
21struct xfs_mount;
22struct xfs_perag;
23
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26
27extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
28
29int xfs_syncd_init(struct xfs_mount *mp);
30void xfs_syncd_stop(struct xfs_mount *mp);
31
32int xfs_quiesce_data(struct xfs_mount *mp);
33void xfs_quiesce_attr(struct xfs_mount *mp);
34
35void xfs_flush_inodes(struct xfs_inode *ip);
36
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
40
41void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
42void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
43void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
44 struct xfs_inode *ip);
45
46int xfs_sync_inode_grab(struct xfs_inode *ip);
47int xfs_inode_ag_iterator(struct xfs_mount *mp,
48 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
49 int flags);
50
51#endif