diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 329 |
1 files changed, 201 insertions, 128 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 6fed97a8cd3e..a9f6d20aff41 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
| @@ -65,7 +65,6 @@ xfs_inode_ag_lookup( | |||
| 65 | * as the tree is sparse and a gang lookup walks to find | 65 | * as the tree is sparse and a gang lookup walks to find |
| 66 | * the number of objects requested. | 66 | * the number of objects requested. |
| 67 | */ | 67 | */ |
| 68 | read_lock(&pag->pag_ici_lock); | ||
| 69 | if (tag == XFS_ICI_NO_TAG) { | 68 | if (tag == XFS_ICI_NO_TAG) { |
| 70 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, | 69 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, |
| 71 | (void **)&ip, *first_index, 1); | 70 | (void **)&ip, *first_index, 1); |
| @@ -74,7 +73,7 @@ xfs_inode_ag_lookup( | |||
| 74 | (void **)&ip, *first_index, 1, tag); | 73 | (void **)&ip, *first_index, 1, tag); |
| 75 | } | 74 | } |
| 76 | if (!nr_found) | 75 | if (!nr_found) |
| 77 | goto unlock; | 76 | return NULL; |
| 78 | 77 | ||
| 79 | /* | 78 | /* |
| 80 | * Update the index for the next lookup. Catch overflows | 79 | * Update the index for the next lookup. Catch overflows |
| @@ -84,25 +83,20 @@ xfs_inode_ag_lookup( | |||
| 84 | */ | 83 | */ |
| 85 | *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); | 84 | *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); |
| 86 | if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) | 85 | if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) |
| 87 | goto unlock; | 86 | return NULL; |
| 88 | |||
| 89 | return ip; | 87 | return ip; |
| 90 | |||
| 91 | unlock: | ||
| 92 | read_unlock(&pag->pag_ici_lock); | ||
| 93 | return NULL; | ||
| 94 | } | 88 | } |
| 95 | 89 | ||
| 96 | STATIC int | 90 | STATIC int |
| 97 | xfs_inode_ag_walk( | 91 | xfs_inode_ag_walk( |
| 98 | struct xfs_mount *mp, | 92 | struct xfs_mount *mp, |
| 99 | xfs_agnumber_t ag, | 93 | struct xfs_perag *pag, |
| 100 | int (*execute)(struct xfs_inode *ip, | 94 | int (*execute)(struct xfs_inode *ip, |
| 101 | struct xfs_perag *pag, int flags), | 95 | struct xfs_perag *pag, int flags), |
| 102 | int flags, | 96 | int flags, |
| 103 | int tag) | 97 | int tag, |
| 98 | int exclusive) | ||
| 104 | { | 99 | { |
| 105 | struct xfs_perag *pag = &mp->m_perag[ag]; | ||
| 106 | uint32_t first_index; | 100 | uint32_t first_index; |
| 107 | int last_error = 0; | 101 | int last_error = 0; |
| 108 | int skipped; | 102 | int skipped; |
| @@ -114,10 +108,20 @@ restart: | |||
| 114 | int error = 0; | 108 | int error = 0; |
| 115 | xfs_inode_t *ip; | 109 | xfs_inode_t *ip; |
| 116 | 110 | ||
| 111 | if (exclusive) | ||
| 112 | write_lock(&pag->pag_ici_lock); | ||
| 113 | else | ||
| 114 | read_lock(&pag->pag_ici_lock); | ||
| 117 | ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); | 115 | ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); |
| 118 | if (!ip) | 116 | if (!ip) { |
| 117 | if (exclusive) | ||
| 118 | write_unlock(&pag->pag_ici_lock); | ||
| 119 | else | ||
| 120 | read_unlock(&pag->pag_ici_lock); | ||
| 119 | break; | 121 | break; |
| 122 | } | ||
| 120 | 123 | ||
| 124 | /* execute releases pag->pag_ici_lock */ | ||
| 121 | error = execute(ip, pag, flags); | 125 | error = execute(ip, pag, flags); |
| 122 | if (error == EAGAIN) { | 126 | if (error == EAGAIN) { |
| 123 | skipped++; | 127 | skipped++; |
| @@ -125,9 +129,8 @@ restart: | |||
| 125 | } | 129 | } |
| 126 | if (error) | 130 | if (error) |
| 127 | last_error = error; | 131 | last_error = error; |
| 128 | /* | 132 | |
| 129 | * bail out if the filesystem is corrupted. | 133 | /* bail out if the filesystem is corrupted. */ |
| 130 | */ | ||
| 131 | if (error == EFSCORRUPTED) | 134 | if (error == EFSCORRUPTED) |
| 132 | break; | 135 | break; |
| 133 | 136 | ||
| @@ -137,8 +140,6 @@ restart: | |||
| 137 | delay(1); | 140 | delay(1); |
| 138 | goto restart; | 141 | goto restart; |
| 139 | } | 142 | } |
| 140 | |||
| 141 | xfs_put_perag(mp, pag); | ||
| 142 | return last_error; | 143 | return last_error; |
| 143 | } | 144 | } |
| 144 | 145 | ||
| @@ -148,16 +149,24 @@ xfs_inode_ag_iterator( | |||
| 148 | int (*execute)(struct xfs_inode *ip, | 149 | int (*execute)(struct xfs_inode *ip, |
| 149 | struct xfs_perag *pag, int flags), | 150 | struct xfs_perag *pag, int flags), |
| 150 | int flags, | 151 | int flags, |
| 151 | int tag) | 152 | int tag, |
| 153 | int exclusive) | ||
| 152 | { | 154 | { |
| 153 | int error = 0; | 155 | int error = 0; |
| 154 | int last_error = 0; | 156 | int last_error = 0; |
| 155 | xfs_agnumber_t ag; | 157 | xfs_agnumber_t ag; |
| 156 | 158 | ||
| 157 | for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { | 159 | for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { |
| 158 | if (!mp->m_perag[ag].pag_ici_init) | 160 | struct xfs_perag *pag; |
| 161 | |||
| 162 | pag = xfs_perag_get(mp, ag); | ||
| 163 | if (!pag->pag_ici_init) { | ||
| 164 | xfs_perag_put(pag); | ||
| 159 | continue; | 165 | continue; |
| 160 | error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); | 166 | } |
| 167 | error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, | ||
| 168 | exclusive); | ||
| 169 | xfs_perag_put(pag); | ||
| 161 | if (error) { | 170 | if (error) { |
| 162 | last_error = error; | 171 | last_error = error; |
| 163 | if (error == EFSCORRUPTED) | 172 | if (error == EFSCORRUPTED) |
| @@ -174,30 +183,31 @@ xfs_sync_inode_valid( | |||
| 174 | struct xfs_perag *pag) | 183 | struct xfs_perag *pag) |
| 175 | { | 184 | { |
| 176 | struct inode *inode = VFS_I(ip); | 185 | struct inode *inode = VFS_I(ip); |
| 186 | int error = EFSCORRUPTED; | ||
| 177 | 187 | ||
| 178 | /* nothing to sync during shutdown */ | 188 | /* nothing to sync during shutdown */ |
| 179 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 189 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
| 180 | read_unlock(&pag->pag_ici_lock); | 190 | goto out_unlock; |
| 181 | return EFSCORRUPTED; | ||
| 182 | } | ||
| 183 | 191 | ||
| 184 | /* | 192 | /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ |
| 185 | * If we can't get a reference on the inode, it must be in reclaim. | 193 | error = ENOENT; |
| 186 | * Leave it for the reclaim code to flush. Also avoid inodes that | 194 | if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) |
| 187 | * haven't been fully initialised. | 195 | goto out_unlock; |
| 188 | */ | 196 | |
| 189 | if (!igrab(inode)) { | 197 | /* If we can't grab the inode, it must be on its way to reclaim. */ |
| 190 | read_unlock(&pag->pag_ici_lock); | 198 | if (!igrab(inode)) |
| 191 | return ENOENT; | 199 | goto out_unlock; |
| 192 | } | ||
| 193 | read_unlock(&pag->pag_ici_lock); | ||
| 194 | 200 | ||
| 195 | if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { | 201 | if (is_bad_inode(inode)) { |
| 196 | IRELE(ip); | 202 | IRELE(ip); |
| 197 | return ENOENT; | 203 | goto out_unlock; |
| 198 | } | 204 | } |
| 199 | 205 | ||
| 200 | return 0; | 206 | /* inode is valid */ |
| 207 | error = 0; | ||
| 208 | out_unlock: | ||
| 209 | read_unlock(&pag->pag_ici_lock); | ||
| 210 | return error; | ||
| 201 | } | 211 | } |
| 202 | 212 | ||
| 203 | STATIC int | 213 | STATIC int |
| @@ -224,7 +234,7 @@ xfs_sync_inode_data( | |||
| 224 | } | 234 | } |
| 225 | 235 | ||
| 226 | error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? | 236 | error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? |
| 227 | 0 : XFS_B_ASYNC, FI_NONE); | 237 | 0 : XBF_ASYNC, FI_NONE); |
| 228 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 238 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
| 229 | 239 | ||
| 230 | out_wait: | 240 | out_wait: |
| @@ -260,8 +270,7 @@ xfs_sync_inode_attr( | |||
| 260 | goto out_unlock; | 270 | goto out_unlock; |
| 261 | } | 271 | } |
| 262 | 272 | ||
| 263 | error = xfs_iflush(ip, (flags & SYNC_WAIT) ? | 273 | error = xfs_iflush(ip, flags); |
| 264 | XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI); | ||
| 265 | 274 | ||
| 266 | out_unlock: | 275 | out_unlock: |
| 267 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 276 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
| @@ -282,14 +291,11 @@ xfs_sync_data( | |||
| 282 | ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); | 291 | ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); |
| 283 | 292 | ||
| 284 | error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, | 293 | error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, |
| 285 | XFS_ICI_NO_TAG); | 294 | XFS_ICI_NO_TAG, 0); |
| 286 | if (error) | 295 | if (error) |
| 287 | return XFS_ERROR(error); | 296 | return XFS_ERROR(error); |
| 288 | 297 | ||
| 289 | xfs_log_force(mp, 0, | 298 | xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); |
| 290 | (flags & SYNC_WAIT) ? | ||
| 291 | XFS_LOG_FORCE | XFS_LOG_SYNC : | ||
| 292 | XFS_LOG_FORCE); | ||
| 293 | return 0; | 299 | return 0; |
| 294 | } | 300 | } |
| 295 | 301 | ||
| @@ -304,7 +310,7 @@ xfs_sync_attr( | |||
| 304 | ASSERT((flags & ~SYNC_WAIT) == 0); | 310 | ASSERT((flags & ~SYNC_WAIT) == 0); |
| 305 | 311 | ||
| 306 | return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, | 312 | return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, |
| 307 | XFS_ICI_NO_TAG); | 313 | XFS_ICI_NO_TAG, 0); |
| 308 | } | 314 | } |
| 309 | 315 | ||
| 310 | STATIC int | 316 | STATIC int |
| @@ -315,10 +321,6 @@ xfs_commit_dummy_trans( | |||
| 315 | struct xfs_inode *ip = mp->m_rootip; | 321 | struct xfs_inode *ip = mp->m_rootip; |
| 316 | struct xfs_trans *tp; | 322 | struct xfs_trans *tp; |
| 317 | int error; | 323 | int error; |
| 318 | int log_flags = XFS_LOG_FORCE; | ||
| 319 | |||
| 320 | if (flags & SYNC_WAIT) | ||
| 321 | log_flags |= XFS_LOG_SYNC; | ||
| 322 | 324 | ||
| 323 | /* | 325 | /* |
| 324 | * Put a dummy transaction in the log to tell recovery | 326 | * Put a dummy transaction in the log to tell recovery |
| @@ -340,11 +342,11 @@ xfs_commit_dummy_trans( | |||
| 340 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 342 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| 341 | 343 | ||
| 342 | /* the log force ensures this transaction is pushed to disk */ | 344 | /* the log force ensures this transaction is pushed to disk */ |
| 343 | xfs_log_force(mp, 0, log_flags); | 345 | xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); |
| 344 | return error; | 346 | return error; |
| 345 | } | 347 | } |
| 346 | 348 | ||
| 347 | int | 349 | STATIC int |
| 348 | xfs_sync_fsdata( | 350 | xfs_sync_fsdata( |
| 349 | struct xfs_mount *mp, | 351 | struct xfs_mount *mp, |
| 350 | int flags) | 352 | int flags) |
| @@ -360,7 +362,7 @@ xfs_sync_fsdata( | |||
| 360 | if (flags & SYNC_TRYLOCK) { | 362 | if (flags & SYNC_TRYLOCK) { |
| 361 | ASSERT(!(flags & SYNC_WAIT)); | 363 | ASSERT(!(flags & SYNC_WAIT)); |
| 362 | 364 | ||
| 363 | bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); | 365 | bp = xfs_getsb(mp, XBF_TRYLOCK); |
| 364 | if (!bp) | 366 | if (!bp) |
| 365 | goto out; | 367 | goto out; |
| 366 | 368 | ||
| @@ -380,7 +382,7 @@ xfs_sync_fsdata( | |||
| 380 | * become pinned in between there and here. | 382 | * become pinned in between there and here. |
| 381 | */ | 383 | */ |
| 382 | if (XFS_BUF_ISPINNED(bp)) | 384 | if (XFS_BUF_ISPINNED(bp)) |
| 383 | xfs_log_force(mp, 0, XFS_LOG_FORCE); | 385 | xfs_log_force(mp, 0); |
| 384 | } | 386 | } |
| 385 | 387 | ||
| 386 | 388 | ||
| @@ -441,9 +443,6 @@ xfs_quiesce_data( | |||
| 441 | xfs_sync_data(mp, SYNC_WAIT); | 443 | xfs_sync_data(mp, SYNC_WAIT); |
| 442 | xfs_qm_sync(mp, SYNC_WAIT); | 444 | xfs_qm_sync(mp, SYNC_WAIT); |
| 443 | 445 | ||
| 444 | /* drop inode references pinned by filestreams */ | ||
| 445 | xfs_filestream_flush(mp); | ||
| 446 | |||
| 447 | /* write superblock and hoover up shutdown errors */ | 446 | /* write superblock and hoover up shutdown errors */ |
| 448 | error = xfs_sync_fsdata(mp, SYNC_WAIT); | 447 | error = xfs_sync_fsdata(mp, SYNC_WAIT); |
| 449 | 448 | ||
| @@ -460,16 +459,18 @@ xfs_quiesce_fs( | |||
| 460 | { | 459 | { |
| 461 | int count = 0, pincount; | 460 | int count = 0, pincount; |
| 462 | 461 | ||
| 462 | xfs_reclaim_inodes(mp, 0); | ||
| 463 | xfs_flush_buftarg(mp->m_ddev_targp, 0); | 463 | xfs_flush_buftarg(mp->m_ddev_targp, 0); |
| 464 | xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); | ||
| 465 | 464 | ||
| 466 | /* | 465 | /* |
| 467 | * This loop must run at least twice. The first instance of the loop | 466 | * This loop must run at least twice. The first instance of the loop |
| 468 | * will flush most meta data but that will generate more meta data | 467 | * will flush most meta data but that will generate more meta data |
| 469 | * (typically directory updates). Which then must be flushed and | 468 | * (typically directory updates). Which then must be flushed and |
| 470 | * logged before we can write the unmount record. | 469 | * logged before we can write the unmount record. We also do sync |
| 470 | * reclaim of inodes to catch any that the above delwri flush skipped. | ||
| 471 | */ | 471 | */ |
| 472 | do { | 472 | do { |
| 473 | xfs_reclaim_inodes(mp, SYNC_WAIT); | ||
| 473 | xfs_sync_attr(mp, SYNC_WAIT); | 474 | xfs_sync_attr(mp, SYNC_WAIT); |
| 474 | pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); | 475 | pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); |
| 475 | if (!pincount) { | 476 | if (!pincount) { |
| @@ -568,7 +569,7 @@ xfs_flush_inodes( | |||
| 568 | igrab(inode); | 569 | igrab(inode); |
| 569 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); | 570 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); |
| 570 | wait_for_completion(&completion); | 571 | wait_for_completion(&completion); |
| 571 | xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); | 572 | xfs_log_force(ip->i_mount, XFS_LOG_SYNC); |
| 572 | } | 573 | } |
| 573 | 574 | ||
| 574 | /* | 575 | /* |
| @@ -584,8 +585,8 @@ xfs_sync_worker( | |||
| 584 | int error; | 585 | int error; |
| 585 | 586 | ||
| 586 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { | 587 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { |
| 587 | xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); | 588 | xfs_log_force(mp, 0); |
| 588 | xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); | 589 | xfs_reclaim_inodes(mp, 0); |
| 589 | /* dgc: errors ignored here */ | 590 | /* dgc: errors ignored here */ |
| 590 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); | 591 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); |
| 591 | error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); | 592 | error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); |
| @@ -664,60 +665,6 @@ xfs_syncd_stop( | |||
| 664 | kthread_stop(mp->m_sync_task); | 665 | kthread_stop(mp->m_sync_task); |
| 665 | } | 666 | } |
| 666 | 667 | ||
| 667 | STATIC int | ||
| 668 | xfs_reclaim_inode( | ||
| 669 | xfs_inode_t *ip, | ||
| 670 | int sync_mode) | ||
| 671 | { | ||
| 672 | xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); | ||
| 673 | |||
| 674 | /* The hash lock here protects a thread in xfs_iget_core from | ||
| 675 | * racing with us on linking the inode back with a vnode. | ||
| 676 | * Once we have the XFS_IRECLAIM flag set it will not touch | ||
| 677 | * us. | ||
| 678 | */ | ||
| 679 | write_lock(&pag->pag_ici_lock); | ||
| 680 | spin_lock(&ip->i_flags_lock); | ||
| 681 | if (__xfs_iflags_test(ip, XFS_IRECLAIM) || | ||
| 682 | !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { | ||
| 683 | spin_unlock(&ip->i_flags_lock); | ||
| 684 | write_unlock(&pag->pag_ici_lock); | ||
| 685 | return -EAGAIN; | ||
| 686 | } | ||
| 687 | __xfs_iflags_set(ip, XFS_IRECLAIM); | ||
| 688 | spin_unlock(&ip->i_flags_lock); | ||
| 689 | write_unlock(&pag->pag_ici_lock); | ||
| 690 | xfs_put_perag(ip->i_mount, pag); | ||
| 691 | |||
| 692 | /* | ||
| 693 | * If the inode is still dirty, then flush it out. If the inode | ||
| 694 | * is not in the AIL, then it will be OK to flush it delwri as | ||
| 695 | * long as xfs_iflush() does not keep any references to the inode. | ||
| 696 | * We leave that decision up to xfs_iflush() since it has the | ||
| 697 | * knowledge of whether it's OK to simply do a delwri flush of | ||
| 698 | * the inode or whether we need to wait until the inode is | ||
| 699 | * pulled from the AIL. | ||
| 700 | * We get the flush lock regardless, though, just to make sure | ||
| 701 | * we don't free it while it is being flushed. | ||
| 702 | */ | ||
| 703 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
| 704 | xfs_iflock(ip); | ||
| 705 | |||
| 706 | /* | ||
| 707 | * In the case of a forced shutdown we rely on xfs_iflush() to | ||
| 708 | * wait for the inode to be unpinned before returning an error. | ||
| 709 | */ | ||
| 710 | if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { | ||
| 711 | /* synchronize with xfs_iflush_done */ | ||
| 712 | xfs_iflock(ip); | ||
| 713 | xfs_ifunlock(ip); | ||
| 714 | } | ||
| 715 | |||
| 716 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 717 | xfs_ireclaim(ip); | ||
| 718 | return 0; | ||
| 719 | } | ||
| 720 | |||
| 721 | void | 668 | void |
| 722 | __xfs_inode_set_reclaim_tag( | 669 | __xfs_inode_set_reclaim_tag( |
| 723 | struct xfs_perag *pag, | 670 | struct xfs_perag *pag, |
| @@ -737,16 +684,17 @@ void | |||
| 737 | xfs_inode_set_reclaim_tag( | 684 | xfs_inode_set_reclaim_tag( |
| 738 | xfs_inode_t *ip) | 685 | xfs_inode_t *ip) |
| 739 | { | 686 | { |
| 740 | xfs_mount_t *mp = ip->i_mount; | 687 | struct xfs_mount *mp = ip->i_mount; |
| 741 | xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); | 688 | struct xfs_perag *pag; |
| 742 | 689 | ||
| 690 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); | ||
| 743 | read_lock(&pag->pag_ici_lock); | 691 | read_lock(&pag->pag_ici_lock); |
| 744 | spin_lock(&ip->i_flags_lock); | 692 | spin_lock(&ip->i_flags_lock); |
| 745 | __xfs_inode_set_reclaim_tag(pag, ip); | 693 | __xfs_inode_set_reclaim_tag(pag, ip); |
| 746 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); | 694 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); |
| 747 | spin_unlock(&ip->i_flags_lock); | 695 | spin_unlock(&ip->i_flags_lock); |
| 748 | read_unlock(&pag->pag_ici_lock); | 696 | read_unlock(&pag->pag_ici_lock); |
| 749 | xfs_put_perag(mp, pag); | 697 | xfs_perag_put(pag); |
| 750 | } | 698 | } |
| 751 | 699 | ||
| 752 | void | 700 | void |
| @@ -759,20 +707,145 @@ __xfs_inode_clear_reclaim_tag( | |||
| 759 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | 707 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); |
| 760 | } | 708 | } |
| 761 | 709 | ||
| 710 | /* | ||
| 711 | * Inodes in different states need to be treated differently, and the return | ||
| 712 | * value of xfs_iflush is not sufficient to get this right. The following table | ||
| 713 | * lists the inode states and the reclaim actions necessary for non-blocking | ||
| 714 | * reclaim: | ||
| 715 | * | ||
| 716 | * | ||
| 717 | * inode state iflush ret required action | ||
| 718 | * --------------- ---------- --------------- | ||
| 719 | * bad - reclaim | ||
| 720 | * shutdown EIO unpin and reclaim | ||
| 721 | * clean, unpinned 0 reclaim | ||
| 722 | * stale, unpinned 0 reclaim | ||
| 723 | * clean, pinned(*) 0 requeue | ||
| 724 | * stale, pinned EAGAIN requeue | ||
| 725 | * dirty, delwri ok 0 requeue | ||
| 726 | * dirty, delwri blocked EAGAIN requeue | ||
| 727 | * dirty, sync flush 0 reclaim | ||
| 728 | * | ||
| 729 | * (*) dgc: I don't think the clean, pinned state is possible but it gets | ||
| 730 | * handled anyway given the order of checks implemented. | ||
| 731 | * | ||
| 732 | * As can be seen from the table, the return value of xfs_iflush() is not | ||
| 733 | * sufficient to correctly decide the reclaim action here. The checks in | ||
| 734 | * xfs_iflush() might look like duplicates, but they are not. | ||
| 735 | * | ||
| 736 | * Also, because we get the flush lock first, we know that any inode that has | ||
| 737 | * been flushed delwri has had the flush completed by the time we check that | ||
| 738 | * the inode is clean. The clean inode check needs to be done before flushing | ||
| 739 | * the inode delwri otherwise we would loop forever requeuing clean inodes as | ||
| 740 | * we cannot tell apart a successful delwri flush and a clean inode from the | ||
| 741 | * return value of xfs_iflush(). | ||
| 742 | * | ||
| 743 | * Note that because the inode is flushed delayed write by background | ||
| 744 | * writeback, the flush lock may already be held here and waiting on it can | ||
| 745 | * result in very long latencies. Hence for sync reclaims, where we wait on the | ||
| 746 | * flush lock, the caller should push out delayed write inodes first before | ||
| 747 | * trying to reclaim them to minimise the amount of time spent waiting. For | ||
| | | 748 | * background reclaim, we just requeue the inode for the next pass. |
| 749 | * | ||
| 750 | * Hence the order of actions after gaining the locks should be: | ||
| 751 | * bad => reclaim | ||
| 752 | * shutdown => unpin and reclaim | ||
| 753 | * pinned, delwri => requeue | ||
| 754 | * pinned, sync => unpin | ||
| 755 | * stale => reclaim | ||
| 756 | * clean => reclaim | ||
| 757 | * dirty, delwri => flush and requeue | ||
| 758 | * dirty, sync => flush, wait and reclaim | ||
| 759 | */ | ||
| 762 | STATIC int | 760 | STATIC int |
| 763 | xfs_reclaim_inode_now( | 761 | xfs_reclaim_inode( |
| 764 | struct xfs_inode *ip, | 762 | struct xfs_inode *ip, |
| 765 | struct xfs_perag *pag, | 763 | struct xfs_perag *pag, |
| 766 | int flags) | 764 | int sync_mode) |
| 767 | { | 765 | { |
| 768 | /* ignore if already under reclaim */ | 766 | int error = 0; |
| 769 | if (xfs_iflags_test(ip, XFS_IRECLAIM)) { | 767 | |
| 770 | read_unlock(&pag->pag_ici_lock); | 768 | /* |
| 769 | * The radix tree lock here protects a thread in xfs_iget from racing | ||
| 770 | * with us starting reclaim on the inode. Once we have the | ||
| 771 | * XFS_IRECLAIM flag set it will not touch us. | ||
| 772 | */ | ||
| 773 | spin_lock(&ip->i_flags_lock); | ||
| 774 | ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); | ||
| 775 | if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { | ||
| 776 | /* ignore as it is already under reclaim */ | ||
| 777 | spin_unlock(&ip->i_flags_lock); | ||
| 778 | write_unlock(&pag->pag_ici_lock); | ||
| 771 | return 0; | 779 | return 0; |
| 772 | } | 780 | } |
| 773 | read_unlock(&pag->pag_ici_lock); | 781 | __xfs_iflags_set(ip, XFS_IRECLAIM); |
| 782 | spin_unlock(&ip->i_flags_lock); | ||
| 783 | write_unlock(&pag->pag_ici_lock); | ||
| 784 | |||
| 785 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
| 786 | if (!xfs_iflock_nowait(ip)) { | ||
| 787 | if (!(sync_mode & SYNC_WAIT)) | ||
| 788 | goto out; | ||
| 789 | xfs_iflock(ip); | ||
| 790 | } | ||
| 791 | |||
| 792 | if (is_bad_inode(VFS_I(ip))) | ||
| 793 | goto reclaim; | ||
| 794 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
| 795 | xfs_iunpin_wait(ip); | ||
| 796 | goto reclaim; | ||
| 797 | } | ||
| 798 | if (xfs_ipincount(ip)) { | ||
| 799 | if (!(sync_mode & SYNC_WAIT)) { | ||
| 800 | xfs_ifunlock(ip); | ||
| 801 | goto out; | ||
| 802 | } | ||
| 803 | xfs_iunpin_wait(ip); | ||
| 804 | } | ||
| 805 | if (xfs_iflags_test(ip, XFS_ISTALE)) | ||
| 806 | goto reclaim; | ||
| 807 | if (xfs_inode_clean(ip)) | ||
| 808 | goto reclaim; | ||
| 809 | |||
| 810 | /* Now we have an inode that needs flushing */ | ||
| 811 | error = xfs_iflush(ip, sync_mode); | ||
| 812 | if (sync_mode & SYNC_WAIT) { | ||
| 813 | xfs_iflock(ip); | ||
| 814 | goto reclaim; | ||
| 815 | } | ||
| 816 | |||
| 817 | /* | ||
| 818 | * When we have to flush an inode but don't have SYNC_WAIT set, we | ||
| 819 | * flush the inode out using a delwri buffer and wait for the next | ||
| 820 | * call into reclaim to find it in a clean state instead of waiting for | ||
| 821 | * it now. We also don't return errors here - if the error is transient | ||
| 822 | * then the next reclaim pass will flush the inode, and if the error | ||
| | | 823 | * is permanent then the next sync reclaim will reclaim the inode and |
| 824 | * pass on the error. | ||
| 825 | */ | ||
| 826 | if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
| 827 | xfs_fs_cmn_err(CE_WARN, ip->i_mount, | ||
| 828 | "inode 0x%llx background reclaim flush failed with %d", | ||
| 829 | (long long)ip->i_ino, error); | ||
| 830 | } | ||
| 831 | out: | ||
| 832 | xfs_iflags_clear(ip, XFS_IRECLAIM); | ||
| 833 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 834 | /* | ||
| 835 | * We could return EAGAIN here to make reclaim rescan the inode tree in | ||
| 836 | * a short while. However, this just burns CPU time scanning the tree | ||
| 837 | * waiting for IO to complete and xfssyncd never goes back to the idle | ||
| 838 | * state. Instead, return 0 to let the next scheduled background reclaim | ||
| 839 | * attempt to reclaim the inode again. | ||
| 840 | */ | ||
| 841 | return 0; | ||
| 842 | |||
| 843 | reclaim: | ||
| 844 | xfs_ifunlock(ip); | ||
| 845 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 846 | xfs_ireclaim(ip); | ||
| 847 | return error; | ||
| 774 | 848 | ||
| 775 | return xfs_reclaim_inode(ip, flags); | ||
| 776 | } | 849 | } |
| 777 | 850 | ||
| 778 | int | 851 | int |
| @@ -780,6 +853,6 @@ xfs_reclaim_inodes( | |||
| 780 | xfs_mount_t *mp, | 853 | xfs_mount_t *mp, |
| 781 | int mode) | 854 | int mode) |
| 782 | { | 855 | { |
| 783 | return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, | 856 | return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, |
| 784 | XFS_ICI_RECLAIM_TAG); | 857 | XFS_ICI_RECLAIM_TAG, 1); |
| 785 | } | 858 | } |
