Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 447
1 files changed, 306 insertions, 141 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 961df0a22c78..a427c638d909 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -44,6 +44,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_rw.h"
 #include "xfs_quota.h"
+#include "xfs_trace.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -64,7 +65,6 @@ xfs_inode_ag_lookup(
 	 * as the tree is sparse and a gang lookup walks to find
 	 * the number of objects requested.
 	 */
-	read_lock(&pag->pag_ici_lock);
 	if (tag == XFS_ICI_NO_TAG) {
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 				(void **)&ip, *first_index, 1);
@@ -73,7 +73,7 @@ xfs_inode_ag_lookup(
 				(void **)&ip, *first_index, 1, tag);
 	}
 	if (!nr_found)
-		goto unlock;
+		return NULL;
 
 	/*
 	 * Update the index for the next lookup. Catch overflows
@@ -83,25 +83,21 @@ xfs_inode_ag_lookup(
 	 */
 	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 	if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-		goto unlock;
-
+		return NULL;
 	return ip;
-
-unlock:
-	read_unlock(&pag->pag_ici_lock);
-	return NULL;
 }
 
 STATIC int
 xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
-	xfs_agnumber_t		ag,
+	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive,
+	int			*nr_to_scan)
 {
-	struct xfs_perag	*pag = &mp->m_perag[ag];
 	uint32_t		first_index;
 	int			last_error = 0;
 	int			skipped;
@@ -113,10 +109,20 @@ restart:
 		int		error = 0;
 		xfs_inode_t	*ip;
 
+		if (exclusive)
+			write_lock(&pag->pag_ici_lock);
+		else
+			read_lock(&pag->pag_ici_lock);
 		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-		if (!ip)
+		if (!ip) {
+			if (exclusive)
+				write_unlock(&pag->pag_ici_lock);
+			else
+				read_unlock(&pag->pag_ici_lock);
 			break;
+		}
 
+		/* execute releases pag->pag_ici_lock */
 		error = execute(ip, pag, flags);
 		if (error == EAGAIN) {
 			skipped++;
@@ -124,20 +130,17 @@ restart:
 		}
 		if (error)
 			last_error = error;
-		/*
-		 * bail out if the filesystem is corrupted.
-		 */
+
+		/* bail out if the filesystem is corrupted. */
 		if (error == EFSCORRUPTED)
 			break;
 
-	} while (1);
+	} while ((*nr_to_scan)--);
 
 	if (skipped) {
 		delay(1);
 		goto restart;
 	}
-
-	xfs_put_perag(mp, pag);
 	return last_error;
 }
 
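The locking contract introduced by the two hunks above is easy to miss: xfs_inode_ag_walk() now takes pag_ici_lock itself (read or write mode, per the exclusive flag) and relies on the execute callback to drop it. A minimal userspace sketch of that lock-handoff pattern, using pthreads and hypothetical names (cache_lock, walk_one) rather than the kernel API:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t cache_lock = PTHREAD_RWLOCK_INITIALIZER;

	/* callback runs with cache_lock held and must release it itself */
	static int walk_one(int item, int exclusive)
	{
		printf("visiting %d (%s)\n", item, exclusive ? "excl" : "shared");
		pthread_rwlock_unlock(&cache_lock);	/* the handoff: callee unlocks */
		return 0;
	}

	static void walk(int nitems, int exclusive)
	{
		for (int i = 0; i < nitems; i++) {
			if (exclusive)
				pthread_rwlock_wrlock(&cache_lock);
			else
				pthread_rwlock_rdlock(&cache_lock);
			walk_one(i, exclusive);		/* releases cache_lock */
		}
	}

	int main(void)
	{
		walk(2, 0);	/* shared walk */
		walk(2, 1);	/* exclusive walk */
		return 0;
	}

The handoff lets the callback decide how long the tree must stay stable, at the cost of an asymmetric lock/unlock that the in-code comment has to document.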
@@ -147,22 +150,37 @@ xfs_inode_ag_iterator(
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			exclusive,
+	int			*nr_to_scan)
 {
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
+	int			nr;
 
+	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
 	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-		if (!mp->m_perag[ag].pag_ici_init)
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, ag);
+		if (!pag->pag_ici_init) {
+			xfs_perag_put(pag);
 			continue;
-		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+		}
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
+						exclusive, &nr);
+		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
 				break;
 		}
+		if (nr <= 0)
+			break;
 	}
+	if (nr_to_scan)
+		*nr_to_scan = nr;
 	return XFS_ERROR(last_error);
 }
 
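With the widened signature, callers choose shared or exclusive tree locking and may pass a scan budget; a NULL nr_to_scan means an unbounded walk (INT_MAX internally) and the remaining budget is written back through the pointer. Both call styles, as they appear later in this patch:

	/* unbounded, shared-lock walk (sync paths) */
	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
				      XFS_ICI_NO_TAG, 0, NULL);

	/* budgeted, exclusive walk (shrinker path) */
	xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
			XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);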
@@ -173,30 +191,31 @@ xfs_sync_inode_valid(
 	struct xfs_perag	*pag)
 {
 	struct inode		*inode = VFS_I(ip);
+	int			error = EFSCORRUPTED;
 
 	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		read_unlock(&pag->pag_ici_lock);
-		return EFSCORRUPTED;
-	}
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_unlock;
 
-	/*
-	 * If we can't get a reference on the inode, it must be in reclaim.
-	 * Leave it for the reclaim code to flush. Also avoid inodes that
-	 * haven't been fully initialised.
-	 */
-	if (!igrab(inode)) {
-		read_unlock(&pag->pag_ici_lock);
-		return ENOENT;
-	}
-	read_unlock(&pag->pag_ici_lock);
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	error = ENOENT;
+	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock;
+
+	/* If we can't grab the inode, it must be on its way to reclaim. */
+	if (!igrab(inode))
+		goto out_unlock;
 
-	if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+	if (is_bad_inode(inode)) {
 		IRELE(ip);
-		return ENOENT;
+		goto out_unlock;
 	}
 
-	return 0;
+	/* inode is valid */
+	error = 0;
+out_unlock:
+	read_unlock(&pag->pag_ici_lock);
+	return error;
 }
 
 STATIC int
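xfs_sync_inode_valid() is restructured here around a single unlock site: error defaults to the failure code and every path funnels through out_unlock, so the read lock is dropped exactly once. A self-contained sketch of the same idiom with illustrative names (try_use, busy), not kernel code:

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static int busy;

	/* every exit path funnels through out_unlock, so the lock is
	 * released exactly once no matter which check fails */
	static int try_use(void)
	{
		int error = EBUSY;

		pthread_mutex_lock(&lock);
		if (busy)
			goto out_unlock;

		busy = 1;
		error = 0;			/* success */
	out_unlock:
		pthread_mutex_unlock(&lock);
		return error;
	}

	int main(void)
	{
		/* first call succeeds (0), second reports EBUSY */
		printf("first: %d, second: %d\n", try_use(), try_use());
		return 0;
	}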
@@ -223,7 +242,7 @@ xfs_sync_inode_data(
 	}
 
 	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XFS_B_ASYNC, FI_NONE);
+				0 : XBF_ASYNC, FI_NONE);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 out_wait:
@@ -259,8 +278,7 @@ xfs_sync_inode_attr(
 		goto out_unlock;
 	}
 
-	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
-			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+	error = xfs_iflush(ip, flags);
 
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -281,14 +299,11 @@ xfs_sync_data(
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG);
+				      XFS_ICI_NO_TAG, 0, NULL);
 	if (error)
 		return XFS_ERROR(error);
 
-	xfs_log_force(mp, 0,
-		      (flags & SYNC_WAIT) ?
-		       XFS_LOG_FORCE | XFS_LOG_SYNC :
-		       XFS_LOG_FORCE);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return 0;
 }
 
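The hunk above also shows the xfs_log_force() signature change that recurs throughout this patch: the old three-argument form took an LSN and an explicit XFS_LOG_FORCE flag, while the new two-argument form implies the force, so only XFS_LOG_SYNC remains meaningful. Before and after, taken from this patch:

	/* before: lsn argument plus explicit force flag */
	xfs_log_force(mp, 0, (flags & SYNC_WAIT) ?
			XFS_LOG_FORCE | XFS_LOG_SYNC : XFS_LOG_FORCE);

	/* after: force implied, pass XFS_LOG_SYNC to wait for it */
	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);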
@@ -303,7 +318,7 @@ xfs_sync_attr(
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
 	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG);
+				     XFS_ICI_NO_TAG, 0, NULL);
 }
 
 STATIC int
@@ -314,10 +329,6 @@ xfs_commit_dummy_trans(
 	struct xfs_inode *ip = mp->m_rootip;
 	struct xfs_trans *tp;
 	int error;
-	int log_flags = XFS_LOG_FORCE;
-
-	if (flags & SYNC_WAIT)
-		log_flags |= XFS_LOG_SYNC;
 
 	/*
 	 * Put a dummy transaction in the log to tell recovery
@@ -339,11 +350,11 @@ xfs_commit_dummy_trans(
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	/* the log force ensures this transaction is pushed to disk */
-	xfs_log_force(mp, 0, log_flags);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return error;
 }
 
-int
+STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp,
 	int			flags)
@@ -359,7 +370,7 @@ xfs_sync_fsdata(
 	if (flags & SYNC_TRYLOCK) {
 		ASSERT(!(flags & SYNC_WAIT));
 
-		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+		bp = xfs_getsb(mp, XBF_TRYLOCK);
 		if (!bp)
 			goto out;
 
@@ -379,7 +390,7 @@ xfs_sync_fsdata(
 	 * become pinned in between there and here.
 	 */
 	if (XFS_BUF_ISPINNED(bp))
-		xfs_log_force(mp, 0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 	}
 
 
@@ -440,9 +451,6 @@ xfs_quiesce_data(
 	xfs_sync_data(mp, SYNC_WAIT);
 	xfs_qm_sync(mp, SYNC_WAIT);
 
-	/* drop inode references pinned by filestreams */
-	xfs_filestream_flush(mp);
-
 	/* write superblock and hoover up shutdown errors */
 	error = xfs_sync_fsdata(mp, SYNC_WAIT);
 
@@ -459,16 +467,18 @@ xfs_quiesce_fs(
 {
 	int	count = 0, pincount;
 
+	xfs_reclaim_inodes(mp, 0);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
 	/*
 	 * This loop must run at least twice. The first instance of the loop
 	 * will flush most meta data but that will generate more meta data
 	 * (typically directory updates). Which then must be flushed and
-	 * logged before we can write the unmount record.
+	 * logged before we can write the unmount record. We also do sync
+	 * reclaim of inodes to catch any that the above delwri flush skipped.
 	 */
 	do {
+		xfs_reclaim_inodes(mp, SYNC_WAIT);
 		xfs_sync_attr(mp, SYNC_WAIT);
 		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
 		if (!pincount) {
@@ -567,7 +577,7 @@ xfs_flush_inodes(
 	igrab(inode);
 	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
 	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
 }
 
 /*
@@ -583,8 +593,8 @@ xfs_sync_worker(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_log_force(mp, 0);
+		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -605,7 +615,8 @@ xfssyncd(
 	set_freezable();
 	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
 	for (;;) {
-		timeleft = schedule_timeout_interruptible(timeleft);
+		if (list_empty(&mp->m_sync_list))
+			timeleft = schedule_timeout_interruptible(timeleft);
 		/* swsusp */
 		try_to_freeze();
 		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -625,8 +636,7 @@ xfssyncd(
 			list_add_tail(&mp->m_sync_work.w_list,
 					&mp->m_sync_list);
 		}
-		list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
-			list_move(&work->w_list, &tmp);
+		list_splice_init(&mp->m_sync_list, &tmp);
 		spin_unlock(&mp->m_sync_lock);
 
 		list_for_each_entry_safe(work, n, &tmp, w_list) {
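The splice replaces a per-entry walk under m_sync_lock: list_splice_init() detaches the whole list onto tmp in O(1) and reinitialises the (now empty) source head, where the old loop moved entries one at a time while holding the spinlock. Old and new forms side by side, lines from this patch:

	/* O(n) under the lock: move each work item individually */
	list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
		list_move(&work->w_list, &tmp);

	/* O(1) under the lock: splice the whole list across */
	list_splice_init(&mp->m_sync_list, &tmp);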
@@ -663,67 +673,6 @@ xfs_syncd_stop(
 	kthread_stop(mp->m_sync_task);
 }
 
-int
-xfs_reclaim_inode(
-	xfs_inode_t	*ip,
-	int		locked,
-	int		sync_mode)
-{
-	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-
-	/* The hash lock here protects a thread in xfs_iget_core from
-	 * racing with us on linking the inode back with a vnode.
-	 * Once we have the XFS_IRECLAIM flag set it will not touch
-	 * us.
-	 */
-	write_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
-		if (locked) {
-			xfs_ifunlock(ip);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-		return -EAGAIN;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(ip->i_mount, pag);
-
-	/*
-	 * If the inode is still dirty, then flush it out. If the inode
-	 * is not in the AIL, then it will be OK to flush it delwri as
-	 * long as xfs_iflush() does not keep any references to the inode.
-	 * We leave that decision up to xfs_iflush() since it has the
-	 * knowledge of whether it's OK to simply do a delwri flush of
-	 * the inode or whether we need to wait until the inode is
-	 * pulled from the AIL.
-	 * We get the flush lock regardless, though, just to make sure
-	 * we don't free it while it is being flushed.
-	 */
-	if (!locked) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_iflock(ip);
-	}
-
-	/*
-	 * In the case of a forced shutdown we rely on xfs_iflush() to
-	 * wait for the inode to be unpinned before returning an error.
-	 */
-	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-		/* synchronize with xfs_iflush_done */
-		xfs_iflock(ip);
-		xfs_ifunlock(ip);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	xfs_ireclaim(ip);
-	return 0;
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
@@ -732,6 +681,7 @@ __xfs_inode_set_reclaim_tag(
 	radix_tree_tag_set(&pag->pag_ici_root,
 			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
 			   XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable++;
 }
 
 /*
@@ -743,16 +693,17 @@ void
 xfs_inode_set_reclaim_tag(
 	xfs_inode_t	*ip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
 
-	read_lock(&pag->pag_ici_lock);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	write_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
 	__xfs_inode_set_reclaim_tag(pag, ip);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
+	write_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
 }
 
 void
@@ -763,22 +714,148 @@ __xfs_inode_clear_reclaim_tag(
 {
 	radix_tree_tag_clear(&pag->pag_ici_root,
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable--;
 }
 
+/*
+ * Inodes in different states need to be treated differently, and the return
+ * value of xfs_iflush is not sufficient to get this right. The following table
+ * lists the inode states and the reclaim actions necessary for non-blocking
+ * reclaim:
+ *
+ *
+ *	inode state		iflush ret	required action
+ *	---------------		----------	---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, delwri ok	0		requeue
+ *	dirty, delwri blocked	EAGAIN		requeue
+ *	dirty, sync flush	0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background reclaim, we just requeue the inode for the next pass.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, delwri	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, delwri	=> flush and requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
 STATIC int
-xfs_reclaim_inode_now(
+xfs_reclaim_inode(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag,
-	int			flags)
+	int			sync_mode)
 {
-	/* ignore if already under reclaim */
-	if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		read_unlock(&pag->pag_ici_lock);
+	int	error = 0;
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* ignore as it is already under reclaim */
+		spin_unlock(&ip->i_flags_lock);
+		write_unlock(&pag->pag_ici_lock);
 		return 0;
 	}
-	read_unlock(&pag->pag_ici_lock);
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT)) {
+			xfs_ifunlock(ip);
+			goto out;
+		}
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/* Now we have an inode that needs flushing */
+	error = xfs_iflush(ip, sync_mode);
+	if (sync_mode & SYNC_WAIT) {
+		xfs_iflock(ip);
+		goto reclaim;
+	}
+
+	/*
+	 * When we have to flush an inode but don't have SYNC_WAIT set, we
+	 * flush the inode out using a delwri buffer and wait for the next
+	 * call into reclaim to find it in a clean state instead of waiting for
+	 * it now. We also don't return errors here - if the error is transient
+	 * then the next reclaim pass will flush the inode, and if the error
+	 * is permanent then the next sync reclaim will reclaim the inode and
+	 * pass on the error.
+	 */
+	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"inode 0x%llx background reclaim flush failed with %d",
+			(long long)ip->i_ino, error);
+	}
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
+
+reclaim:
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_ireclaim(ip);
+	return error;
 
-	return xfs_reclaim_inode(ip, 0, flags);
 }
 
 int
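The state table in the comment above compresses into a short decision ladder. A hypothetical userspace model of that ladder, handy for checking the ordering of the checks against the table (names and states are illustrative, not kernel API):

	#include <stdio.h>

	enum action { RECLAIM, REQUEUE };

	struct istate {
		int bad, shutdown, pinned, stale, clean, sync_wait;
	};

	/* mirrors the ordering of checks in xfs_reclaim_inode() above */
	static enum action reclaim_action(const struct istate *s)
	{
		if (s->bad)
			return RECLAIM;
		if (s->shutdown)
			return RECLAIM;		/* unpin, then reclaim */
		if (s->pinned && !s->sync_wait)
			return REQUEUE;		/* never block background reclaim */
		/* pinned + sync_wait: unpin, then keep checking */
		if (s->stale || s->clean)
			return RECLAIM;
		/* dirty: flush; sync waits and reclaims, async requeues */
		return s->sync_wait ? RECLAIM : REQUEUE;
	}

	int main(void)
	{
		struct istate bg = { 0 }, sync = { .sync_wait = 1 };

		printf("dirty, background: %s\n",
		       reclaim_action(&bg) == RECLAIM ? "reclaim" : "requeue");
		printf("dirty, sync: %s\n",
		       reclaim_action(&sync) == RECLAIM ? "reclaim" : "requeue");
		return 0;
	}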
@@ -786,6 +863,94 @@ xfs_reclaim_inodes(
 	xfs_mount_t	*mp,
 	int		mode)
 {
-	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
-					XFS_ICI_RECLAIM_TAG);
+	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
+					XFS_ICI_RECLAIM_TAG, 1, NULL);
+}
+
+/*
+ * Shrinker infrastructure.
+ *
+ * This is all far more complex than it needs to be. It adds a global list of
+ * mounts because the shrinkers can only call a global context. We need to make
+ * the shrinkers pass a context to avoid the need for global state.
+ */
+static LIST_HEAD(xfs_mount_list);
+static struct rw_semaphore xfs_mount_list_lock;
+
+static int
+xfs_reclaim_inode_shrink(
+	int		nr_to_scan,
+	gfp_t		gfp_mask)
+{
+	struct xfs_mount *mp;
+	struct xfs_perag *pag;
+	xfs_agnumber_t	ag;
+	int		reclaimable = 0;
+
+	if (nr_to_scan) {
+		if (!(gfp_mask & __GFP_FS))
+			return -1;
+
+		down_read(&xfs_mount_list_lock);
+		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
+			if (nr_to_scan <= 0)
+				break;
+		}
+		up_read(&xfs_mount_list_lock);
+	}
+
+	down_read(&xfs_mount_list_lock);
+	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+
+			pag = xfs_perag_get(mp, ag);
+			if (!pag->pag_ici_init) {
+				xfs_perag_put(pag);
+				continue;
+			}
+			reclaimable += pag->pag_ici_reclaimable;
+			xfs_perag_put(pag);
+		}
+	}
+	up_read(&xfs_mount_list_lock);
+	return reclaimable;
+}
+
+static struct shrinker xfs_inode_shrinker = {
+	.shrink = xfs_reclaim_inode_shrink,
+	.seeks = DEFAULT_SEEKS,
+};
+
+void __init
+xfs_inode_shrinker_init(void)
+{
+	init_rwsem(&xfs_mount_list_lock);
+	register_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_destroy(void)
+{
+	ASSERT(list_empty(&xfs_mount_list));
+	unregister_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_register(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_add_tail(&mp->m_mplist, &xfs_mount_list);
+	up_write(&xfs_mount_list_lock);
+}
+
+void
+xfs_inode_shrinker_unregister(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_del(&mp->m_mplist);
+	up_write(&xfs_mount_list_lock);
 }
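For reference, the shrinker contract this code targets, inferred from the callback above (this is the old two-argument ->shrink API of this kernel generation, later replaced): the callback is invoked with nr_to_scan == 0 to ask for a count of reclaimable objects, with a positive nr_to_scan to free that many, and may return -1 to decline, as done above when __GFP_FS is clear. A minimal sketch of another user of the same interface (the my_* names are hypothetical):

	/* sketch only: two-argument ->shrink as used by this era's kernels */
	static int my_cache_count;		/* hypothetical object counter */

	static int
	my_shrink(
		int		nr_to_scan,
		gfp_t		gfp_mask)
	{
		if (nr_to_scan) {
			/* can't take fs locks from a non-__GFP_FS allocation */
			if (!(gfp_mask & __GFP_FS))
				return -1;
			/* ... free up to nr_to_scan objects here ... */
		}
		/* always report how many objects remain reclaimable */
		return my_cache_count;
	}

	static struct shrinker my_shrinker = {
		.shrink	= my_shrink,
		.seeks	= DEFAULT_SEEKS,
	};

	/* register_shrinker(&my_shrinker) at init,
	 * unregister_shrinker(&my_shrinker) at teardown */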