diff options
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 186 |
1 files changed, 135 insertions, 51 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 1f5e4bb5e970..a9f6d20aff41 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -90,14 +90,13 @@ xfs_inode_ag_lookup( | |||
90 | STATIC int | 90 | STATIC int |
91 | xfs_inode_ag_walk( | 91 | xfs_inode_ag_walk( |
92 | struct xfs_mount *mp, | 92 | struct xfs_mount *mp, |
93 | xfs_agnumber_t ag, | 93 | struct xfs_perag *pag, |
94 | int (*execute)(struct xfs_inode *ip, | 94 | int (*execute)(struct xfs_inode *ip, |
95 | struct xfs_perag *pag, int flags), | 95 | struct xfs_perag *pag, int flags), |
96 | int flags, | 96 | int flags, |
97 | int tag, | 97 | int tag, |
98 | int exclusive) | 98 | int exclusive) |
99 | { | 99 | { |
100 | struct xfs_perag *pag = &mp->m_perag[ag]; | ||
101 | uint32_t first_index; | 100 | uint32_t first_index; |
102 | int last_error = 0; | 101 | int last_error = 0; |
103 | int skipped; | 102 | int skipped; |
@@ -141,8 +140,6 @@ restart: | |||
141 | delay(1); | 140 | delay(1); |
142 | goto restart; | 141 | goto restart; |
143 | } | 142 | } |
144 | |||
145 | xfs_put_perag(mp, pag); | ||
146 | return last_error; | 143 | return last_error; |
147 | } | 144 | } |
148 | 145 | ||
@@ -160,10 +157,16 @@ xfs_inode_ag_iterator( | |||
160 | xfs_agnumber_t ag; | 157 | xfs_agnumber_t ag; |
161 | 158 | ||
162 | for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { | 159 | for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { |
163 | if (!mp->m_perag[ag].pag_ici_init) | 160 | struct xfs_perag *pag; |
161 | |||
162 | pag = xfs_perag_get(mp, ag); | ||
163 | if (!pag->pag_ici_init) { | ||
164 | xfs_perag_put(pag); | ||
164 | continue; | 165 | continue; |
165 | error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, | 166 | } |
167 | error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, | ||
166 | exclusive); | 168 | exclusive); |
169 | xfs_perag_put(pag); | ||
167 | if (error) { | 170 | if (error) { |
168 | last_error = error; | 171 | last_error = error; |
169 | if (error == EFSCORRUPTED) | 172 | if (error == EFSCORRUPTED) |
@@ -231,7 +234,7 @@ xfs_sync_inode_data( | |||
231 | } | 234 | } |
232 | 235 | ||
233 | error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? | 236 | error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? |
234 | 0 : XFS_B_ASYNC, FI_NONE); | 237 | 0 : XBF_ASYNC, FI_NONE); |
235 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 238 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
236 | 239 | ||
237 | out_wait: | 240 | out_wait: |
@@ -267,8 +270,7 @@ xfs_sync_inode_attr( | |||
267 | goto out_unlock; | 270 | goto out_unlock; |
268 | } | 271 | } |
269 | 272 | ||
270 | error = xfs_iflush(ip, (flags & SYNC_WAIT) ? | 273 | error = xfs_iflush(ip, flags); |
271 | XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI); | ||
272 | 274 | ||
273 | out_unlock: | 275 | out_unlock: |
274 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 276 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
@@ -293,10 +295,7 @@ xfs_sync_data( | |||
293 | if (error) | 295 | if (error) |
294 | return XFS_ERROR(error); | 296 | return XFS_ERROR(error); |
295 | 297 | ||
296 | xfs_log_force(mp, 0, | 298 | xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); |
297 | (flags & SYNC_WAIT) ? | ||
298 | XFS_LOG_FORCE | XFS_LOG_SYNC : | ||
299 | XFS_LOG_FORCE); | ||
300 | return 0; | 299 | return 0; |
301 | } | 300 | } |
302 | 301 | ||
@@ -322,10 +321,6 @@ xfs_commit_dummy_trans( | |||
322 | struct xfs_inode *ip = mp->m_rootip; | 321 | struct xfs_inode *ip = mp->m_rootip; |
323 | struct xfs_trans *tp; | 322 | struct xfs_trans *tp; |
324 | int error; | 323 | int error; |
325 | int log_flags = XFS_LOG_FORCE; | ||
326 | |||
327 | if (flags & SYNC_WAIT) | ||
328 | log_flags |= XFS_LOG_SYNC; | ||
329 | 324 | ||
330 | /* | 325 | /* |
331 | * Put a dummy transaction in the log to tell recovery | 326 | * Put a dummy transaction in the log to tell recovery |
@@ -347,11 +342,11 @@ xfs_commit_dummy_trans( | |||
347 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 342 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
348 | 343 | ||
349 | /* the log force ensures this transaction is pushed to disk */ | 344 | /* the log force ensures this transaction is pushed to disk */ |
350 | xfs_log_force(mp, 0, log_flags); | 345 | xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); |
351 | return error; | 346 | return error; |
352 | } | 347 | } |
353 | 348 | ||
354 | int | 349 | STATIC int |
355 | xfs_sync_fsdata( | 350 | xfs_sync_fsdata( |
356 | struct xfs_mount *mp, | 351 | struct xfs_mount *mp, |
357 | int flags) | 352 | int flags) |
@@ -367,7 +362,7 @@ xfs_sync_fsdata( | |||
367 | if (flags & SYNC_TRYLOCK) { | 362 | if (flags & SYNC_TRYLOCK) { |
368 | ASSERT(!(flags & SYNC_WAIT)); | 363 | ASSERT(!(flags & SYNC_WAIT)); |
369 | 364 | ||
370 | bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); | 365 | bp = xfs_getsb(mp, XBF_TRYLOCK); |
371 | if (!bp) | 366 | if (!bp) |
372 | goto out; | 367 | goto out; |
373 | 368 | ||
@@ -387,7 +382,7 @@ xfs_sync_fsdata( | |||
387 | * become pinned in between there and here. | 382 | * become pinned in between there and here. |
388 | */ | 383 | */ |
389 | if (XFS_BUF_ISPINNED(bp)) | 384 | if (XFS_BUF_ISPINNED(bp)) |
390 | xfs_log_force(mp, 0, XFS_LOG_FORCE); | 385 | xfs_log_force(mp, 0); |
391 | } | 386 | } |
392 | 387 | ||
393 | 388 | ||
@@ -448,9 +443,6 @@ xfs_quiesce_data( | |||
448 | xfs_sync_data(mp, SYNC_WAIT); | 443 | xfs_sync_data(mp, SYNC_WAIT); |
449 | xfs_qm_sync(mp, SYNC_WAIT); | 444 | xfs_qm_sync(mp, SYNC_WAIT); |
450 | 445 | ||
451 | /* drop inode references pinned by filestreams */ | ||
452 | xfs_filestream_flush(mp); | ||
453 | |||
454 | /* write superblock and hoover up shutdown errors */ | 446 | /* write superblock and hoover up shutdown errors */ |
455 | error = xfs_sync_fsdata(mp, SYNC_WAIT); | 447 | error = xfs_sync_fsdata(mp, SYNC_WAIT); |
456 | 448 | ||
@@ -467,16 +459,18 @@ xfs_quiesce_fs( | |||
467 | { | 459 | { |
468 | int count = 0, pincount; | 460 | int count = 0, pincount; |
469 | 461 | ||
462 | xfs_reclaim_inodes(mp, 0); | ||
470 | xfs_flush_buftarg(mp->m_ddev_targp, 0); | 463 | xfs_flush_buftarg(mp->m_ddev_targp, 0); |
471 | xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); | ||
472 | 464 | ||
473 | /* | 465 | /* |
474 | * This loop must run at least twice. The first instance of the loop | 466 | * This loop must run at least twice. The first instance of the loop |
475 | * will flush most meta data but that will generate more meta data | 467 | * will flush most meta data but that will generate more meta data |
476 | * (typically directory updates). Which then must be flushed and | 468 | * (typically directory updates). Which then must be flushed and |
477 | * logged before we can write the unmount record. | 469 | * logged before we can write the unmount record. We also do sync |
470 | * reclaim of inodes to catch any that the above delwri flush skipped. | ||
478 | */ | 471 | */ |
479 | do { | 472 | do { |
473 | xfs_reclaim_inodes(mp, SYNC_WAIT); | ||
480 | xfs_sync_attr(mp, SYNC_WAIT); | 474 | xfs_sync_attr(mp, SYNC_WAIT); |
481 | pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); | 475 | pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); |
482 | if (!pincount) { | 476 | if (!pincount) { |
@@ -575,7 +569,7 @@ xfs_flush_inodes( | |||
575 | igrab(inode); | 569 | igrab(inode); |
576 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); | 570 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); |
577 | wait_for_completion(&completion); | 571 | wait_for_completion(&completion); |
578 | xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); | 572 | xfs_log_force(ip->i_mount, XFS_LOG_SYNC); |
579 | } | 573 | } |
580 | 574 | ||
581 | /* | 575 | /* |
@@ -591,8 +585,8 @@ xfs_sync_worker( | |||
591 | int error; | 585 | int error; |
592 | 586 | ||
593 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { | 587 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { |
594 | xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); | 588 | xfs_log_force(mp, 0); |
595 | xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); | 589 | xfs_reclaim_inodes(mp, 0); |
596 | /* dgc: errors ignored here */ | 590 | /* dgc: errors ignored here */ |
597 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); | 591 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); |
598 | error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); | 592 | error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); |
@@ -690,16 +684,17 @@ void | |||
690 | xfs_inode_set_reclaim_tag( | 684 | xfs_inode_set_reclaim_tag( |
691 | xfs_inode_t *ip) | 685 | xfs_inode_t *ip) |
692 | { | 686 | { |
693 | xfs_mount_t *mp = ip->i_mount; | 687 | struct xfs_mount *mp = ip->i_mount; |
694 | xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); | 688 | struct xfs_perag *pag; |
695 | 689 | ||
690 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); | ||
696 | read_lock(&pag->pag_ici_lock); | 691 | read_lock(&pag->pag_ici_lock); |
697 | spin_lock(&ip->i_flags_lock); | 692 | spin_lock(&ip->i_flags_lock); |
698 | __xfs_inode_set_reclaim_tag(pag, ip); | 693 | __xfs_inode_set_reclaim_tag(pag, ip); |
699 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); | 694 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); |
700 | spin_unlock(&ip->i_flags_lock); | 695 | spin_unlock(&ip->i_flags_lock); |
701 | read_unlock(&pag->pag_ici_lock); | 696 | read_unlock(&pag->pag_ici_lock); |
702 | xfs_put_perag(mp, pag); | 697 | xfs_perag_put(pag); |
703 | } | 698 | } |
704 | 699 | ||
705 | void | 700 | void |
@@ -712,12 +707,64 @@ __xfs_inode_clear_reclaim_tag( | |||
712 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | 707 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); |
713 | } | 708 | } |
714 | 709 | ||
710 | /* | ||
711 | * Inodes in different states need to be treated differently, and the return | ||
712 | * value of xfs_iflush is not sufficient to get this right. The following table | ||
713 | * lists the inode states and the reclaim actions necessary for non-blocking | ||
714 | * reclaim: | ||
715 | * | ||
716 | * | ||
717 | * inode state iflush ret required action | ||
718 | * --------------- ---------- --------------- | ||
719 | * bad - reclaim | ||
720 | * shutdown EIO unpin and reclaim | ||
721 | * clean, unpinned 0 reclaim | ||
722 | * stale, unpinned 0 reclaim | ||
723 | * clean, pinned(*) 0 requeue | ||
724 | * stale, pinned EAGAIN requeue | ||
725 | * dirty, delwri ok 0 requeue | ||
726 | * dirty, delwri blocked EAGAIN requeue | ||
727 | * dirty, sync flush 0 reclaim | ||
728 | * | ||
729 | * (*) dgc: I don't think the clean, pinned state is possible but it gets | ||
730 | * handled anyway given the order of checks implemented. | ||
731 | * | ||
732 | * As can be seen from the table, the return value of xfs_iflush() is not | ||
733 | * sufficient to correctly decide the reclaim action here. The checks in | ||
734 | * xfs_iflush() might look like duplicates, but they are not. | ||
735 | * | ||
736 | * Also, because we get the flush lock first, we know that any inode that has | ||
737 | * been flushed delwri has had the flush completed by the time we check that | ||
738 | * the inode is clean. The clean inode check needs to be done before flushing | ||
739 | * the inode delwri otherwise we would loop forever requeuing clean inodes as | ||
740 | * we cannot tell apart a successful delwri flush and a clean inode from the | ||
741 | * return value of xfs_iflush(). | ||
742 | * | ||
743 | * Note that because the inode is flushed delayed write by background | ||
744 | * writeback, the flush lock may already be held here and waiting on it can | ||
745 | * result in very long latencies. Hence for sync reclaims, where we wait on the | ||
746 | * flush lock, the caller should push out delayed write inodes first before | ||
747 | * trying to reclaim them to minimise the amount of time spent waiting. For | ||
748 | * background reclaim, we just requeue the inode for the next pass. | ||
749 | * | ||
750 | * Hence the order of actions after gaining the locks should be: | ||
751 | * bad => reclaim | ||
752 | * shutdown => unpin and reclaim | ||
753 | * pinned, delwri => requeue | ||
754 | * pinned, sync => unpin | ||
755 | * stale => reclaim | ||
756 | * clean => reclaim | ||
757 | * dirty, delwri => flush and requeue | ||
758 | * dirty, sync => flush, wait and reclaim | ||
759 | */ | ||
715 | STATIC int | 760 | STATIC int |
716 | xfs_reclaim_inode( | 761 | xfs_reclaim_inode( |
717 | struct xfs_inode *ip, | 762 | struct xfs_inode *ip, |
718 | struct xfs_perag *pag, | 763 | struct xfs_perag *pag, |
719 | int sync_mode) | 764 | int sync_mode) |
720 | { | 765 | { |
766 | int error = 0; | ||
767 | |||
721 | /* | 768 | /* |
722 | * The radix tree lock here protects a thread in xfs_iget from racing | 769 | * The radix tree lock here protects a thread in xfs_iget from racing |
723 | * with us starting reclaim on the inode. Once we have the | 770 | * with us starting reclaim on the inode. Once we have the |
@@ -735,33 +782,70 @@ xfs_reclaim_inode( | |||
735 | spin_unlock(&ip->i_flags_lock); | 782 | spin_unlock(&ip->i_flags_lock); |
736 | write_unlock(&pag->pag_ici_lock); | 783 | write_unlock(&pag->pag_ici_lock); |
737 | 784 | ||
738 | /* | ||
739 | * If the inode is still dirty, then flush it out. If the inode | ||
740 | * is not in the AIL, then it will be OK to flush it delwri as | ||
741 | * long as xfs_iflush() does not keep any references to the inode. | ||
742 | * We leave that decision up to xfs_iflush() since it has the | ||
743 | * knowledge of whether it's OK to simply do a delwri flush of | ||
744 | * the inode or whether we need to wait until the inode is | ||
745 | * pulled from the AIL. | ||
746 | * We get the flush lock regardless, though, just to make sure | ||
747 | * we don't free it while it is being flushed. | ||
748 | */ | ||
749 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 785 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
750 | xfs_iflock(ip); | 786 | if (!xfs_iflock_nowait(ip)) { |
787 | if (!(sync_mode & SYNC_WAIT)) | ||
788 | goto out; | ||
789 | xfs_iflock(ip); | ||
790 | } | ||
791 | |||
792 | if (is_bad_inode(VFS_I(ip))) | ||
793 | goto reclaim; | ||
794 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
795 | xfs_iunpin_wait(ip); | ||
796 | goto reclaim; | ||
797 | } | ||
798 | if (xfs_ipincount(ip)) { | ||
799 | if (!(sync_mode & SYNC_WAIT)) { | ||
800 | xfs_ifunlock(ip); | ||
801 | goto out; | ||
802 | } | ||
803 | xfs_iunpin_wait(ip); | ||
804 | } | ||
805 | if (xfs_iflags_test(ip, XFS_ISTALE)) | ||
806 | goto reclaim; | ||
807 | if (xfs_inode_clean(ip)) | ||
808 | goto reclaim; | ||
809 | |||
810 | /* Now we have an inode that needs flushing */ | ||
811 | error = xfs_iflush(ip, sync_mode); | ||
812 | if (sync_mode & SYNC_WAIT) { | ||
813 | xfs_iflock(ip); | ||
814 | goto reclaim; | ||
815 | } | ||
751 | 816 | ||
752 | /* | 817 | /* |
753 | * In the case of a forced shutdown we rely on xfs_iflush() to | 818 | * When we have to flush an inode but don't have SYNC_WAIT set, we |
754 | * wait for the inode to be unpinned before returning an error. | 819 | * flush the inode out using a delwri buffer and wait for the next |
820 | * call into reclaim to find it in a clean state instead of waiting for | ||
821 | * it now. We also don't return errors here - if the error is transient | ||
822 | * then the next reclaim pass will flush the inode, and if the error | ||
823 | * is permanent then the next sync reclaim will relcaim the inode and | ||
824 | * pass on the error. | ||
755 | */ | 825 | */ |
756 | if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { | 826 | if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
757 | /* synchronize with xfs_iflush_done */ | 827 | xfs_fs_cmn_err(CE_WARN, ip->i_mount, |
758 | xfs_iflock(ip); | 828 | "inode 0x%llx background reclaim flush failed with %d", |
759 | xfs_ifunlock(ip); | 829 | (long long)ip->i_ino, error); |
760 | } | 830 | } |
831 | out: | ||
832 | xfs_iflags_clear(ip, XFS_IRECLAIM); | ||
833 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
834 | /* | ||
835 | * We could return EAGAIN here to make reclaim rescan the inode tree in | ||
836 | * a short while. However, this just burns CPU time scanning the tree | ||
837 | * waiting for IO to complete and xfssyncd never goes back to the idle | ||
838 | * state. Instead, return 0 to let the next scheduled background reclaim | ||
839 | * attempt to reclaim the inode again. | ||
840 | */ | ||
841 | return 0; | ||
761 | 842 | ||
843 | reclaim: | ||
844 | xfs_ifunlock(ip); | ||
762 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 845 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
763 | xfs_ireclaim(ip); | 846 | xfs_ireclaim(ip); |
764 | return 0; | 847 | return error; |
848 | |||
765 | } | 849 | } |
766 | 850 | ||
767 | int | 851 | int |