Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
 fs/xfs/linux-2.6/xfs_sync.c | 447 ++++++++++++++++++++++++++++++-------------
 1 file changed, 306 insertions(+), 141 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 961df0a22c78..a427c638d909 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -44,6 +44,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_rw.h"
 #include "xfs_quota.h"
+#include "xfs_trace.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -64,7 +65,6 @@ xfs_inode_ag_lookup(
	 * as the tree is sparse and a gang lookup walks to find
	 * the number of objects requested.
	 */
-	read_lock(&pag->pag_ici_lock);
	if (tag == XFS_ICI_NO_TAG) {
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)&ip, *first_index, 1);
@@ -73,7 +73,7 @@ xfs_inode_ag_lookup(
				(void **)&ip, *first_index, 1, tag);
	}
	if (!nr_found)
-		goto unlock;
+		return NULL;
 
	/*
	 * Update the index for the next lookup. Catch overflows
@@ -83,25 +83,21 @@ xfs_inode_ag_lookup(
	 */
	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-		goto unlock;
-
+		return NULL;
	return ip;
-
-unlock:
-	read_unlock(&pag->pag_ici_lock);
-	return NULL;
 }
 
 STATIC int
 xfs_inode_ag_walk(
	struct xfs_mount	*mp,
-	xfs_agnumber_t		ag,
+	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive,
+	int			*nr_to_scan)
 {
-	struct xfs_perag	*pag = &mp->m_perag[ag];
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
@@ -113,10 +109,20 @@ restart:
		int		error = 0;
		xfs_inode_t	*ip;
 
+		if (exclusive)
+			write_lock(&pag->pag_ici_lock);
+		else
+			read_lock(&pag->pag_ici_lock);
		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-		if (!ip)
+		if (!ip) {
+			if (exclusive)
+				write_unlock(&pag->pag_ici_lock);
+			else
+				read_unlock(&pag->pag_ici_lock);
			break;
+		}
 
+		/* execute releases pag->pag_ici_lock */
		error = execute(ip, pag, flags);
		if (error == EAGAIN) {
			skipped++;
@@ -124,20 +130,17 @@ restart:
		}
		if (error)
			last_error = error;
-		/*
-		 * bail out if the filesystem is corrupted.
-		 */
+
+		/* bail out if the filesystem is corrupted. */
		if (error == EFSCORRUPTED)
			break;
 
-	} while (1);
+	} while ((*nr_to_scan)--);
 
	if (skipped) {
		delay(1);
		goto restart;
	}
-
-	xfs_put_perag(mp, pag);
	return last_error;
 }
 
@@ -147,22 +150,37 @@ xfs_inode_ag_iterator(
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive,
+	int			*nr_to_scan)
 {
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
+	int			nr;
 
+	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-		if (!mp->m_perag[ag].pag_ici_init)
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, ag);
+		if (!pag->pag_ici_init) {
+			xfs_perag_put(pag);
			continue;
-		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+		}
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
+						exclusive, &nr);
+		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
+		if (nr <= 0)
+			break;
	}
+	if (nr_to_scan)
+		*nr_to_scan = nr;
	return XFS_ERROR(last_error);
 }
 
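The reworked iterator above threads an optional scan budget through the per-AG walk: a NULL nr_to_scan is mapped to INT_MAX, xfs_inode_ag_walk() post-decrements the budget in its loop condition, and the caller breaks out of the AG loop once nr drops to zero or below. A minimal userspace sketch of this cursor-plus-budget pattern (illustrative only; every name below is invented, none of it is part of the patch):

#include <stdio.h>

#define NITEMS 10

/* stand-in for one AG's inode radix tree: nonzero slots are "live" */
static const int live[NITEMS] = { 1, 1, 0, 1, 0, 0, 1, 1, 0, 1 };

/* like xfs_inode_ag_lookup(): return next live index at or after *cursor */
static int lookup(unsigned int *cursor)
{
        for (unsigned int i = *cursor; i < NITEMS; i++) {
                if (live[i]) {
                        *cursor = i + 1;        /* advance for next lookup */
                        return (int)i;
                }
        }
        return -1;
}

/* like xfs_inode_ag_walk(): process entries until exhausted or budget spent */
static void walk(int *nr_to_scan)
{
        unsigned int cursor = 0;

        do {
                int idx = lookup(&cursor);

                if (idx < 0)
                        break;
                printf("processing entry %d\n", idx);
        } while ((*nr_to_scan)--);      /* post-decrement, as in the patch */
}

int main(void)
{
        int budget = 3;                 /* a NULL budget maps to INT_MAX above */

        walk(&budget);
        printf("remaining budget: %d\n", budget);
        return 0;
}

Because the budget is post-decremented, up to budget + 1 entries are visited and the counter can end up negative, which is why xfs_inode_ag_iterator() tests nr <= 0 rather than nr == 0 before breaking out.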
@@ -173,30 +191,31 @@ xfs_sync_inode_valid(
	struct xfs_perag	*pag)
 {
	struct inode		*inode = VFS_I(ip);
+	int			error = EFSCORRUPTED;
 
	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		read_unlock(&pag->pag_ici_lock);
-		return EFSCORRUPTED;
-	}
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_unlock;
 
-	/*
-	 * If we can't get a reference on the inode, it must be in reclaim.
-	 * Leave it for the reclaim code to flush. Also avoid inodes that
-	 * haven't been fully initialised.
-	 */
-	if (!igrab(inode)) {
-		read_unlock(&pag->pag_ici_lock);
-		return ENOENT;
-	}
-	read_unlock(&pag->pag_ici_lock);
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	error = ENOENT;
+	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock;
+
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		goto out_unlock;
 
-	if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+	if (is_bad_inode(inode)) {
		IRELE(ip);
-		return ENOENT;
+		goto out_unlock;
	}
 
-	return 0;
+	/* inode is valid */
+	error = 0;
+out_unlock:
+	read_unlock(&pag->pag_ici_lock);
+	return error;
 }
 
 STATIC int
@@ -223,7 +242,7 @@ xfs_sync_inode_data(
	}
 
	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XFS_B_ASYNC, FI_NONE);
+				0 : XBF_ASYNC, FI_NONE);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
  out_wait:
@@ -259,8 +278,7 @@ xfs_sync_inode_attr(
		goto out_unlock;
	}
 
-	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
-			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+	error = xfs_iflush(ip, flags);
 
  out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -281,14 +299,11 @@ xfs_sync_data(
	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG);
+				      XFS_ICI_NO_TAG, 0, NULL);
	if (error)
		return XFS_ERROR(error);
 
-	xfs_log_force(mp, 0,
-		      (flags & SYNC_WAIT) ?
-		       XFS_LOG_FORCE | XFS_LOG_SYNC :
-		       XFS_LOG_FORCE);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
	return 0;
 }
 
@@ -303,7 +318,7 @@ xfs_sync_attr(
	ASSERT((flags & ~SYNC_WAIT) == 0);
 
	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG);
+				     XFS_ICI_NO_TAG, 0, NULL);
 }
 
 STATIC int
@@ -314,10 +329,6 @@ xfs_commit_dummy_trans(
	struct xfs_inode	*ip = mp->m_rootip;
	struct xfs_trans	*tp;
	int			error;
-	int			log_flags = XFS_LOG_FORCE;
-
-	if (flags & SYNC_WAIT)
-		log_flags |= XFS_LOG_SYNC;
 
	/*
	 * Put a dummy transaction in the log to tell recovery
@@ -339,11 +350,11 @@ xfs_commit_dummy_trans(
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
	/* the log force ensures this transaction is pushed to disk */
-	xfs_log_force(mp, 0, log_flags);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
	return error;
 }
 
-int
+STATIC int
 xfs_sync_fsdata(
	struct xfs_mount	*mp,
	int			flags)
@@ -359,7 +370,7 @@ xfs_sync_fsdata(
	if (flags & SYNC_TRYLOCK) {
		ASSERT(!(flags & SYNC_WAIT));
 
-		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+		bp = xfs_getsb(mp, XBF_TRYLOCK);
		if (!bp)
			goto out;
 
@@ -379,7 +390,7 @@ xfs_sync_fsdata(
	 * become pinned in between there and here.
	 */
	if (XFS_BUF_ISPINNED(bp))
-		xfs_log_force(mp, 0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
	}
 
 
@@ -440,9 +451,6 @@ xfs_quiesce_data(
	xfs_sync_data(mp, SYNC_WAIT);
	xfs_qm_sync(mp, SYNC_WAIT);
 
-	/* drop inode references pinned by filestreams */
-	xfs_filestream_flush(mp);
-
	/* write superblock and hoover up shutdown errors */
	error = xfs_sync_fsdata(mp, SYNC_WAIT);
 
@@ -459,16 +467,18 @@ xfs_quiesce_fs(
 {
	int	count = 0, pincount;
 
+	xfs_reclaim_inodes(mp, 0);
	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
	/*
	 * This loop must run at least twice. The first instance of the loop
	 * will flush most meta data but that will generate more meta data
	 * (typically directory updates). Which then must be flushed and
-	 * logged before we can write the unmount record.
+	 * logged before we can write the unmount record. We also so sync
+	 * reclaim of inodes to catch any that the above delwri flush skipped.
	 */
	do {
+		xfs_reclaim_inodes(mp, SYNC_WAIT);
		xfs_sync_attr(mp, SYNC_WAIT);
		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
		if (!pincount) {
@@ -567,7 +577,7 @@ xfs_flush_inodes(
	igrab(inode);
	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
 }
 
 /*
@@ -583,8 +593,8 @@ xfs_sync_worker(
	int		error;
 
	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_log_force(mp, 0);
+		xfs_reclaim_inodes(mp, 0);
		/* dgc: errors ignored here */
		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -605,7 +615,8 @@ xfssyncd(
	set_freezable();
	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
	for (;;) {
-		timeleft = schedule_timeout_interruptible(timeleft);
+		if (list_empty(&mp->m_sync_list))
+			timeleft = schedule_timeout_interruptible(timeleft);
		/* swsusp */
		try_to_freeze();
		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -625,8 +636,7 @@ xfssyncd(
			list_add_tail(&mp->m_sync_work.w_list,
					&mp->m_sync_list);
		}
-		list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
-			list_move(&work->w_list, &tmp);
+		list_splice_init(&mp->m_sync_list, &tmp);
		spin_unlock(&mp->m_sync_lock);
 
		list_for_each_entry_safe(work, n, &tmp, w_list) {
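The xfssyncd change above replaces a list_for_each_entry_safe()/list_move() loop with a single list_splice_init(): the whole chain of queued work items is transplanted onto the private tmp list in O(1) while m_sync_lock is held, and the source list is left reinitialised and empty, exactly what the old loop achieved one entry at a time. A userspace re-implementation of the splice semantics for illustration (a sketch of what <linux/list.h> does, not the kernel code itself):

#include <stdio.h>

struct list_head {
        struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* move everything on 'list' to the front of 'head'; 'list' ends up empty */
static void list_splice_init(struct list_head *list, struct list_head *head)
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        if (first == list)                      /* nothing queued */
                return;

        last->next = head->next;                /* one O(1) relink of the chain */
        head->next->prev = last;
        head->next = first;
        first->prev = head;

        list->next = list->prev = list;         /* reinitialise the source */
}

int main(void)
{
        struct list_head sync_list = LIST_HEAD_INIT(sync_list);
        struct list_head tmp = LIST_HEAD_INIT(tmp);
        struct list_head a, b;

        /* queue two work items: sync_list -> a -> b */
        a.prev = &sync_list; a.next = &b;
        b.prev = &a; b.next = &sync_list;
        sync_list.next = &a; sync_list.prev = &b;

        list_splice_init(&sync_list, &tmp);
        printf("source empty: %d, tmp head is a: %d\n",
               sync_list.next == &sync_list, tmp.next == &a);
        return 0;
}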
@@ -663,67 +673,6 @@ xfs_syncd_stop(
	kthread_stop(mp->m_sync_task);
 }
 
-int
-xfs_reclaim_inode(
-	xfs_inode_t	*ip,
-	int		locked,
-	int		sync_mode)
-{
-	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-
-	/* The hash lock here protects a thread in xfs_iget_core from
-	 * racing with us on linking the inode back with a vnode.
-	 * Once we have the XFS_IRECLAIM flag set it will not touch
-	 * us.
-	 */
-	write_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
-		if (locked) {
-			xfs_ifunlock(ip);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-		return -EAGAIN;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(ip->i_mount, pag);
-
-	/*
-	 * If the inode is still dirty, then flush it out. If the inode
-	 * is not in the AIL, then it will be OK to flush it delwri as
-	 * long as xfs_iflush() does not keep any references to the inode.
-	 * We leave that decision up to xfs_iflush() since it has the
-	 * knowledge of whether it's OK to simply do a delwri flush of
-	 * the inode or whether we need to wait until the inode is
-	 * pulled from the AIL.
-	 * We get the flush lock regardless, though, just to make sure
-	 * we don't free it while it is being flushed.
-	 */
-	if (!locked) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_iflock(ip);
-	}
-
-	/*
-	 * In the case of a forced shutdown we rely on xfs_iflush() to
-	 * wait for the inode to be unpinned before returning an error.
-	 */
-	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-		/* synchronize with xfs_iflush_done */
-		xfs_iflock(ip);
-		xfs_ifunlock(ip);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	xfs_ireclaim(ip);
-	return 0;
-}
-
 void
 __xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
@@ -732,6 +681,7 @@ __xfs_inode_set_reclaim_tag(
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable++;
 }
 
 /*
@@ -743,16 +693,17 @@ void
 xfs_inode_set_reclaim_tag(
	xfs_inode_t	*ip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
 
-	read_lock(&pag->pag_ici_lock);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	write_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	__xfs_inode_set_reclaim_tag(pag, ip);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
+	write_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
 }
 
 void
@@ -763,22 +714,148 @@ __xfs_inode_clear_reclaim_tag(
 {
	radix_tree_tag_clear(&pag->pag_ici_root,
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable--;
 }
 
+/*
+ * Inodes in different states need to be treated differently, and the return
+ * value of xfs_iflush is not sufficient to get this right. The following table
+ * lists the inode states and the reclaim actions necessary for non-blocking
+ * reclaim:
+ *
+ *
+ *	inode state	     iflush ret		required action
+ *	---------------      ----------		---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, delwri ok	0		requeue
+ *	dirty, delwri blocked	EAGAIN		requeue
+ *	dirty, sync flush	0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background relaim, we just requeue the inode for the next pass.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, delwri	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, delwri	=> flush and requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
 STATIC int
-xfs_reclaim_inode_now(
+xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
-	int			flags)
+	int			sync_mode)
 {
-	/* ignore if already under reclaim */
-	if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		read_unlock(&pag->pag_ici_lock);
+	int	error = 0;
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode. Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* ignore as it is already under reclaim */
+		spin_unlock(&ip->i_flags_lock);
+		write_unlock(&pag->pag_ici_lock);
		return 0;
	}
-	read_unlock(&pag->pag_ici_lock);
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT)) {
+			xfs_ifunlock(ip);
+			goto out;
+		}
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/* Now we have an inode that needs flushing */
+	error = xfs_iflush(ip, sync_mode);
+	if (sync_mode & SYNC_WAIT) {
+		xfs_iflock(ip);
+		goto reclaim;
+	}
+
+	/*
+	 * When we have to flush an inode but don't have SYNC_WAIT set, we
+	 * flush the inode out using a delwri buffer and wait for the next
+	 * call into reclaim to find it in a clean state instead of waiting for
+	 * it now. We also don't return errors here - if the error is transient
+	 * then the next reclaim pass will flush the inode, and if the error
+	 * is permanent then the next sync reclaim will reclaim the inode and
+	 * pass on the error.
+	 */
+	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"inode 0x%llx background reclaim flush failed with %d",
+			(long long)ip->i_ino, error);
+	}
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
+
+reclaim:
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_ireclaim(ip);
+	return error;
 
-	return xfs_reclaim_inode(ip, 0, flags);
 }
 
 int
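The decision table in the comment block above boils down to a short, pure decision function. The following sketch is purely illustrative (the enum, struct and function names are invented); in the patch the same logic is inline in xfs_reclaim_inode() and runs under the inode lock and flush lock:

#include <stdio.h>

enum action {
        RECLAIM,                /* unlock and xfs_ireclaim() */
        UNPIN_AND_RECLAIM,      /* xfs_iunpin_wait(), then reclaim */
        REQUEUE,                /* leave for the next reclaim pass */
        FLUSH_AND_REQUEUE,      /* delwri flush, reclaim on a later pass */
        FLUSH_WAIT_AND_RECLAIM  /* sync flush, wait, then reclaim */
};

struct istate {
        int bad;                /* is_bad_inode() */
        int shutdown;           /* XFS_FORCED_SHUTDOWN() */
        int pinned;             /* xfs_ipincount() != 0 */
        int stale;              /* XFS_ISTALE set */
        int clean;              /* xfs_inode_clean() */
        int sync;               /* caller passed SYNC_WAIT */
};

/* the post-locking order of actions documented above */
static enum action reclaim_action(const struct istate *s)
{
        if (s->bad)
                return RECLAIM;
        if (s->shutdown)
                return UNPIN_AND_RECLAIM;
        if (s->pinned && !s->sync)
                return REQUEUE;         /* pinned, delwri => requeue */
        /* pinned, sync => unpin via xfs_iunpin_wait(), then fall through */
        if (s->stale || s->clean)
                return RECLAIM;
        return s->sync ? FLUSH_WAIT_AND_RECLAIM : FLUSH_AND_REQUEUE;
}

int main(void)
{
        struct istate pinned_bg = { .pinned = 1, .sync = 0 };

        /* background reclaim simply requeues a pinned inode */
        printf("%s\n", reclaim_action(&pinned_bg) == REQUEUE ? "requeue" : "?");
        return 0;
}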
@@ -786,6 +863,94 @@ xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
 {
-	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
-					XFS_ICI_RECLAIM_TAG);
+	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+					XFS_ICI_RECLAIM_TAG, 1, NULL);
+}
+
+/*
+ * Shrinker infrastructure.
+ *
+ * This is all far more complex than it needs to be. It adds a global list of
+ * mounts because the shrinkers can only call a global context. We need to make
+ * the shrinkers pass a context to avoid the need for global state.
+ */
+static LIST_HEAD(xfs_mount_list);
+static struct rw_semaphore xfs_mount_list_lock;
+
+static int
+xfs_reclaim_inode_shrink(
+	int		nr_to_scan,
+	gfp_t		gfp_mask)
+{
+	struct xfs_mount *mp;
+	struct xfs_perag *pag;
+	xfs_agnumber_t	ag;
+	int		reclaimable = 0;
+
+	if (nr_to_scan) {
+		if (!(gfp_mask & __GFP_FS))
+			return -1;
+
+		down_read(&xfs_mount_list_lock);
+		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
+			if (nr_to_scan <= 0)
+				break;
+		}
+		up_read(&xfs_mount_list_lock);
+	}
+
+	down_read(&xfs_mount_list_lock);
+	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+
+			pag = xfs_perag_get(mp, ag);
+			if (!pag->pag_ici_init) {
+				xfs_perag_put(pag);
+				continue;
+			}
+			reclaimable += pag->pag_ici_reclaimable;
+			xfs_perag_put(pag);
+		}
+	}
+	up_read(&xfs_mount_list_lock);
+	return reclaimable;
+}
+
+static struct shrinker xfs_inode_shrinker = {
+	.shrink = xfs_reclaim_inode_shrink,
+	.seeks = DEFAULT_SEEKS,
+};
+
+void __init
+xfs_inode_shrinker_init(void)
+{
+	init_rwsem(&xfs_mount_list_lock);
+	register_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_destroy(void)
+{
+	ASSERT(list_empty(&xfs_mount_list));
+	unregister_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_register(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_add_tail(&mp->m_mplist, &xfs_mount_list);
+	up_write(&xfs_mount_list_lock);
+}
+
+void
+xfs_inode_shrinker_unregister(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_del(&mp->m_mplist);
+	up_write(&xfs_mount_list_lock);
 }
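For context, the shrinker added above follows the two-phase contract of the struct shrinker API of this kernel generation: called with nr_to_scan == 0 the callback only reports how many objects are reclaimable, called with a non-zero budget it must do the reclaim work, and it returns -1 to refuse when the allocation context cannot re-enter the filesystem. A userspace mock of that contract (a sketch under stated assumptions: gfp_t, __GFP_FS and the object count are stand-ins, and the register/unregister call sites in the XFS mount path are assumed rather than shown in this diff):

#include <stdio.h>

typedef unsigned int gfp_t;             /* stand-in for the kernel type */
#define __GFP_FS (1u << 7)              /* stand-in flag value */

static int cached_objects = 100;        /* stand-in for the summed
                                           pag_ici_reclaimable counts */

/* mirrors the shape of xfs_reclaim_inode_shrink() above */
static int example_shrink(int nr_to_scan, gfp_t gfp_mask)
{
        if (nr_to_scan) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;      /* cannot recurse into the fs */
                if (nr_to_scan > cached_objects)
                        nr_to_scan = cached_objects;
                cached_objects -= nr_to_scan;   /* "reclaim" up to budget */
        }
        return cached_objects;          /* always report what remains */
}

int main(void)
{
        printf("count pass: %d\n", example_shrink(0, __GFP_FS));
        printf("scan pass:  %d\n", example_shrink(10, __GFP_FS));
        printf("no-fs ctx:  %d\n", example_shrink(10, 0));
        return 0;
}

The global mount list exists only because this shrinker API carries no per-instance context, which is exactly the complaint recorded in the "Shrinker infrastructure" comment.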